xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +25 -6
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +8 -2
  5. xinference/core/supervisor.py +16 -0
  6. xinference/model/embedding/core.py +1 -2
  7. xinference/model/llm/__init__.py +0 -6
  8. xinference/model/llm/ggml/llamacpp.py +2 -10
  9. xinference/model/llm/llm_family.json +244 -7
  10. xinference/model/llm/llm_family.py +15 -0
  11. xinference/model/llm/llm_family_modelscope.json +100 -0
  12. xinference/model/llm/pytorch/chatglm.py +2 -0
  13. xinference/model/llm/pytorch/core.py +22 -28
  14. xinference/model/llm/pytorch/internlm2.py +2 -0
  15. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  16. xinference/model/llm/pytorch/yi_vl.py +4 -2
  17. xinference/model/llm/utils.py +42 -4
  18. xinference/model/llm/vllm/core.py +51 -6
  19. xinference/model/rerank/core.py +3 -0
  20. xinference/thirdparty/omnilmm/chat.py +1 -1
  21. xinference/types.py +15 -19
  22. xinference/web/ui/build/asset-manifest.json +3 -3
  23. xinference/web/ui/build/index.html +1 -1
  24. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  25. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  27. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  44. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
  45. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
  46. xinference/model/llm/ggml/ctransformers.py +0 -281
  47. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  48. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  49. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  52. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  73. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  74. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  75. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  76. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -1825,6 +1825,17 @@
       "model_id": "qwen/Qwen1.5-14B-Chat",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "pytorch",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "4-bit",
+        "8-bit",
+        "none"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "pytorch",
       "model_size_in_billions": 72,
@@ -1886,6 +1897,15 @@
       "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "gptq",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "gptq",
       "model_size_in_billions": 72,
@@ -1941,6 +1961,15 @@
       "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
       "model_hub": "modelscope"
     },
+    {
+      "model_format": "awq",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "Int4"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
+      "model_hub": "modelscope"
+    },
     {
       "model_format": "awq",
       "model_size_in_billions": 72,
@@ -2035,6 +2064,23 @@
       "model_hub": "modelscope",
       "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
     },
+    {
+      "model_format": "ggufv2",
+      "model_size_in_billions": 32,
+      "quantizations": [
+        "q2_k",
+        "q3_k_m",
+        "q4_0",
+        "q4_k_m",
+        "q5_0",
+        "q5_k_m",
+        "q6_k",
+        "q8_0"
+      ],
+      "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
+      "model_hub": "modelscope",
+      "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+    },
     {
       "model_format": "ggufv2",
       "model_size_in_billions": 72,
@@ -2075,6 +2121,60 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
xinference/model/llm/pytorch/chatglm.py CHANGED
@@ -135,6 +135,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             chat_history = [h for h in chat_history if not h.get("tool_calls")]
         if not chat_history:
             chat_history = []
+        if system_prompt:
+            chat_history.append({"role": "system", "content": system_prompt})
         if tools:
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
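A hedged sketch of exercising the new system-prompt handling from the client side; the endpoint and model uid below are hypothetical and assume a chatglm model is already launched:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")   # hypothetical endpoint
    model = client.get_model("my-chatglm3")    # hypothetical model uid
    # The system prompt is now prepended to chat_history as a "system" message.
    completion = model.chat(
        prompt="Summarize the latest release in one sentence.",
        system_prompt="You are a concise release-notes assistant.",
        generate_config={"max_tokens": 128},
    )
    print(completion["choices"][0]["message"]["content"])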
xinference/model/llm/pytorch/core.py CHANGED
@@ -42,6 +42,25 @@ from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

+NON_DEFAULT_MODEL_LIST: List[str] = [
+    "baichuan-chat",
+    "baichuan-2-chat",
+    "vicuna-v1.3",
+    "falcon",
+    "falcon-instruct",
+    "chatglm",
+    "chatglm2",
+    "chatglm2-32k",
+    "chatglm2-128k",
+    "llama-2",
+    "llama-2-chat",
+    "internlm2-chat",
+    "qwen-vl-chat",
+    "OmniLMM",
+    "yi-vl-chat",
+    "deepseek-vl-chat",
+]
+

 class PytorchModel(LLM):
     def __init__(
@@ -233,17 +252,7 @@ class PytorchModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
-        if model_family in [
-            "baichuan-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-        ]:
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -452,23 +461,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
-        if llm_family.model_name in [
-            "baichuan-chat",
-            "baichuan-2-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-            "internlm2-chat",
-            "qwen-vl-chat",
-            "OmniLMM",
-            "yi-vl-chat",
-            "deepseek-vl-chat",
-        ]:
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "chat" not in llm_family.model_ability:
             return False
xinference/model/llm/pytorch/internlm2.py CHANGED
@@ -114,6 +114,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             ]
         else:
             input_history = []
+        if system_prompt:
+            kwargs["meta_instruction"] = system_prompt
         if stream:

             def _stream_generator():
xinference/model/llm/pytorch/qwen_vl.py CHANGED
@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):

         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device

         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
xinference/model/llm/pytorch/yi_vl.py CHANGED
@@ -59,6 +59,8 @@ class YiVLChatModel(PytorchChatModel):

         self._device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(self._device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        self._device = "auto" if self._device == "cuda" else self._device

         key_info["model_path"] = self.model_path
         # Default device_map is auto, it can loads model to multiple cards.
@@ -190,7 +192,7 @@
                 prompt, self._tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             .unsqueeze(0)
-            .to(self._device)
+            .to(self._model.device)
         )

         images = state.get_images(return_pil=True)
@@ -215,7 +217,7 @@
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._device),
+            .to(self._model.device),
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
xinference/model/llm/utils.py CHANGED
@@ -163,7 +163,7 @@ class ChatModelMixin:

         for i, message in enumerate(chat_history):
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")
             tool_calls = message.get("tool_calls")
             if tool_calls:
                 content = tool_calls[0]["function"]
@@ -248,7 +248,7 @@ Begin!"""
         ret = f"<|im_start|>system\n{prompt_style.system_prompt}<|im_end|>"
         for message in chat_history:
             role = get_role(message["role"])
-            content = message["content"]
+            content = message.get("content")

             ret += prompt_style.intra_message_sep
             if tools:
@@ -446,6 +446,11 @@ Begin!"""
                     "index": i,
                     "delta": {
                         "content": choice["text"],
+                        **(
+                            {"tool_calls": choice["tool_calls"]}
+                            if "tool_calls" in choice
+                            else {}
+                        ),
                     },
                     "finish_reason": choice["finish_reason"],
                 }
@@ -592,8 +597,7 @@ Begin!"""
         return text, None, None

     @classmethod
-    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
-        _id = str(uuid.uuid4())
+    def _eval_tool_arguments(cls, model_family, c, tools):
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
@@ -606,7 +610,41 @@ Begin!"""
                 f"Model {model_family.model_name} is not support tool calls."
             )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
+        return content, func, args
+
+    @classmethod
+    def _tools_token_filter(cls, model_family):
+        """
+        Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".

+        Returns:
+            A function that takes tokens (string output by the model so far) as input
+            returns True if current token is after "\nFinal Answer:", else False.
+        """
+        family = model_family.model_family or model_family.model_name
+        if family in ["qwen-chat", "qwen1.5-chat"]:
+            # Encapsulating function to reset 'found' after each call
+            found = False
+
+            def process_token(tokens: str):
+                nonlocal found
+                # Once "Final Answer:" is found, future tokens are allowed.
+                if found:
+                    return True
+                # Check if the token ends with "\nFinal Answer:" and update `found`.
+                if tokens.endswith("\nFinal Answer:"):
+                    found = True
+                return False
+
+            return process_token
+        else:
+            # For other families, allow all tokens.
+            return lambda tokens: True
+
+    @classmethod
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
+        _id = str(uuid.uuid4())
+        content, func, args = cls._eval_tool_arguments(model_family, c, tools)
         if func:
             m = {
                 "role": "assistant",
xinference/model/llm/vllm/core.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import json
 import logging
 import multiprocessing
 import time
@@ -36,6 +37,8 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    ToolCallFunction,
+    ToolCalls,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
 from ..llm_family import CustomLLMFamilyV1
@@ -80,7 +83,15 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_MODELS = ["llama-2", "baichuan", "internlm-16k", "mistral-v0.1"]
+VLLM_SUPPORTED_MODELS = [
+    "llama-2",
+    "baichuan",
+    "internlm-16k",
+    "mistral-v0.1",
+    "Yi",
+    "code-llama",
+    "code-llama-python",
+]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
     "vicuna-v1.3",
@@ -90,16 +101,16 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-7b",
     "internlm-chat-8k",
     "internlm-chat-20b",
+    "internlm2-chat",
     "qwen-chat",
-    "Yi",
     "Yi-chat",
-    "code-llama",
-    "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
     "chatglm3",
+    "chatglm3-32k",
+    "chatglm3-128k",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
@@ -113,6 +124,9 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("orion-chat-rag")

+if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+

 class VLLMModel(LLM):
     def __init__(
@@ -293,6 +307,7 @@ class VLLMModel(LLM):
         self,
         prompt: str,
         generate_config: Optional[Dict] = None,
+        tools: object = False,
     ) -> Union[Completion, AsyncGenerator[CompletionChunk, None]]:
         try:
             from vllm.sampling_params import SamplingParams
@@ -319,16 +334,46 @@

         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
+            tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
                     model=self.model_uid,
                     request_output=_request_output,
                 )
+
                 for i, choice in enumerate(chunk["choices"]):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+
+                if tools:
+                    # only handle the first choice
+                    choice = chunk["choices"][0]
+                    if choice["finish_reason"] is not None:
+                        # use previous text for evaluation temporarily
+                        choice_delta = choice["text"]
+                        choice["text"] = previous_texts[0]
+                        _content, func, args = ChatModelMixin._eval_tool_arguments(
+                            self.model_family, chunk, tools
+                        )
+                        choice["text"] = choice_delta
+                        if func is not None:
+                            choice["text"] = None
+                            choice["finish_reason"] = "tool_calls"
+                            choice["tool_calls"] = [
+                                ToolCalls(
+                                    id=str(uuid.uuid4()),
+                                    type="function",
+                                    function=ToolCallFunction(
+                                        name=func,
+                                        arguments=json.dumps(args, ensure_ascii=False),
+                                    ),
+                                )
+                            ]
+                    # use a filter function to skip Qwen's react thought process
+                    elif not tools_token_filter(previous_texts[0]):
+                        continue
                 prompt_tokens = len(_request_output.prompt_token_ids)
                 completion_tokens = sum(
                     len(output.token_ids) for output in _request_output.outputs
@@ -416,7 +461,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
         model_family = self.model_family.model_family or self.model_family.model_name
-        if tools and "qwen-chat" == model_family:
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -429,7 +474,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         stream = generate_config.get("stream", None)

         if stream:
-            agen = await self.async_generate(full_prompt, generate_config)
+            agen = await self.async_generate(full_prompt, generate_config, tools)
             assert isinstance(agen, AsyncGenerator)
             return self._async_to_chat_completion_chunks(agen)
         else:
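For orientation, a minimal sketch of the chunk-delta shape these changes produce once a tool call is detected mid-stream; the function name and arguments are purely illustrative, and ToolCallFunction/ToolCalls are the TypedDicts added to xinference/types.py later in this diff:

    import json

    from xinference.types import ToolCallFunction, ToolCalls

    # Illustrative only: mirrors what the vLLM backend now attaches to the final
    # streamed choice when finish_reason flips to "tool_calls".
    tool_call = ToolCalls(
        id="call-0",  # real ids are uuid4 strings
        type="function",
        function=ToolCallFunction(
            name="get_current_weather",  # hypothetical tool name
            arguments=json.dumps({"location": "Beijing"}, ensure_ascii=False),
        ),
    )
    chunk_delta = {"content": None, "tool_calls": [tool_call]}
    print(chunk_delta)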
xinference/model/rerank/core.py CHANGED
@@ -134,8 +134,11 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        **kwargs,
     ) -> Rerank:
         assert self._model is not None
+        if kwargs:
+            raise ValueError("rerank hasn't support extra parameter.")
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
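A brief hedged sketch of a plain rerank call under the new guard; the endpoint and model uid are hypothetical:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")     # hypothetical endpoint
    model = client.get_model("my-rerank-model")  # hypothetical rerank model uid
    result = model.rerank(
        documents=["Paris is in France.", "Berlin is in Germany."],
        query="Where is Paris?",
        top_n=1,
    )
    # Unsupported extra keyword arguments now raise ValueError instead of being silently ignored.
    print(result)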
xinference/thirdparty/omnilmm/chat.py CHANGED
@@ -207,7 +207,7 @@ class OmniLMM3B:

 class OmniLMMChat:
     def __init__(self, model_path, device_map) -> None:
-        if "12B" in model_path:
+        if "12b" in model_path:
             self.model = OmniLMM12B(model_path, device_map)
         else:
             self.model = OmniLMM3B(model_path, device_map)
xinference/types.py CHANGED
@@ -91,11 +91,23 @@ class CompletionLogprobs(TypedDict):
     top_logprobs: List[Optional[Dict[str, float]]]


+class ToolCallFunction(TypedDict):
+    name: str
+    arguments: str
+
+
+class ToolCalls(TypedDict):
+    id: str
+    type: Literal["function"]
+    function: ToolCallFunction
+
+
 class CompletionChoice(TypedDict):
     text: str
     index: int
     logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class CompletionUsage(TypedDict):
@@ -147,6 +159,7 @@ class ChatCompletion(TypedDict):
 class ChatCompletionChunkDelta(TypedDict):
     role: NotRequired[str]
     content: NotRequired[str]
+    tool_calls: NotRequired[List[ToolCalls]]


 class ChatCompletionChunkChoice(TypedDict):
@@ -232,6 +245,8 @@ class LlamaCppModelConfig(TypedDict, total=False):
     n_ctx: int
     n_parts: int
     n_gpu_layers: int
+    split_mode: int
+    main_gpu: int
     seed: int
     f16_kv: bool
     logits_all: bool
@@ -355,21 +370,6 @@ try:
 except ImportError:
     CreateCompletionLlamaCpp = create_model("CreateCompletionLlamaCpp")

-CreateCompletionCTransformers: BaseModel
-try:
-    from ctransformers.llm import LLM
-
-    CreateCompletionCTransformers = get_pydantic_model_from_method(
-        LLM.generate,
-        exclude_fields=["tokens"],
-        include_fields={
-            "max_tokens": (Optional[int], max_tokens_field),
-            "stream": (Optional[bool], stream_field),
-        },
-    )
-except ImportError:
-    CreateCompletionCTransformers = create_model("CreateCompletionCTransformers")
-

 # This type is for openai API compatibility
 CreateCompletionOpenAI: BaseModel
@@ -415,7 +415,6 @@ class CreateCompletion(
     ModelAndPrompt,
     CreateCompletionTorch,
     CreateCompletionLlamaCpp,
-    CreateCompletionCTransformers,
     CreateCompletionOpenAI,
 ):
     pass
@@ -428,8 +427,6 @@ class CreateChatModel(BaseModel):
 # Currently, chat calls generates, so the params share the same one.
 CreateChatCompletionTorch = CreateCompletionTorch
 CreateChatCompletionLlamaCpp: BaseModel = CreateCompletionLlamaCpp
-CreateChatCompletionCTransformers: BaseModel = CreateCompletionCTransformers
-

 # This type is for openai API compatibility
 CreateChatCompletionOpenAI: BaseModel
@@ -450,7 +447,6 @@ class CreateChatCompletion(
     CreateChatModel,
     CreateChatCompletionTorch,
     CreateChatCompletionLlamaCpp,
-    CreateChatCompletionCTransformers,
     CreateChatCompletionOpenAI,
 ):
     pass
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.98516614.js",
+    "main.js": "./static/js/main.76ef2b17.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.98516614.js.map": "./static/js/main.98516614.js.map"
+    "main.76ef2b17.js.map": "./static/js/main.76ef2b17.js.map"
   },
   "entrypoints": [
-    "static/js/main.98516614.js"
+    "static/js/main.76ef2b17.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.98516614.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.76ef2b17.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>