xinference 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (85)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +108 -14
  3. xinference/client/restful/restful_client.py +78 -5
  4. xinference/constants.py +1 -0
  5. xinference/core/cache_tracker.py +48 -28
  6. xinference/core/event.py +5 -6
  7. xinference/core/model.py +59 -42
  8. xinference/core/scheduler.py +46 -18
  9. xinference/core/supervisor.py +73 -24
  10. xinference/core/worker.py +68 -2
  11. xinference/deploy/cmdline.py +86 -2
  12. xinference/deploy/test/test_cmdline.py +19 -10
  13. xinference/model/audio/__init__.py +14 -1
  14. xinference/model/audio/core.py +12 -1
  15. xinference/model/audio/custom.py +6 -4
  16. xinference/model/audio/model_spec_modelscope.json +20 -0
  17. xinference/model/llm/__init__.py +34 -2
  18. xinference/model/llm/llm_family.json +8 -2
  19. xinference/model/llm/llm_family.py +86 -1
  20. xinference/model/llm/llm_family_csghub.json +66 -0
  21. xinference/model/llm/llm_family_modelscope.json +8 -2
  22. xinference/model/llm/pytorch/chatglm.py +41 -12
  23. xinference/model/llm/pytorch/core.py +128 -88
  24. xinference/model/llm/pytorch/glm4v.py +24 -3
  25. xinference/model/llm/pytorch/internlm2.py +15 -0
  26. xinference/model/llm/pytorch/qwen_vl.py +1 -1
  27. xinference/model/llm/pytorch/utils.py +69 -189
  28. xinference/model/llm/utils.py +27 -14
  29. xinference/model/llm/vllm/core.py +10 -4
  30. xinference/model/rerank/core.py +35 -6
  31. xinference/model/utils.py +8 -2
  32. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  33. xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
  34. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  35. xinference/thirdparty/ChatTTS/infer/api.py +125 -0
  36. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  37. xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
  38. xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
  39. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  40. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
  41. xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
  42. xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
  43. xinference/types.py +28 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.4bafd904.css +2 -0
  47. xinference/web/ui/build/static/css/main.4bafd904.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.b80d9c08.js +3 -0
  49. xinference/web/ui/build/static/js/main.b80d9c08.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/16537795de12c61903b6110c241f62a7855b2d0fc1e7c3d1faa347267f3a6893.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/17b8f071491402d70b146532358b1a612226e5dc7b3e8755a1322d27b4680cee.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/395409bd005e19d48b437c48d88e5126c7865ba9631fe98535333c952e383dc5.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/43991bb67c3136863e6fb37f796466b12eb547a1465408cc77820fddafb3bed3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/{15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json → 935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json} +1 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/a7109d4425e3d94ca2726fc7020fd33bf5030afd4c9cf4bf71e21776cd70646a.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
  63. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/METADATA +1 -1
  64. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/RECORD +69 -56
  65. xinference/web/ui/build/static/css/main.54bca460.css +0 -2
  66. xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
  67. xinference/web/ui/build/static/js/main.551aa479.js +0 -3
  68. xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +0 -1
  81. /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.b80d9c08.js.LICENSE.txt} +0 -0
  82. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/LICENSE +0 -0
  83. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/WHEEL +0 -0
  84. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/entry_points.txt +0 -0
  85. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.py
@@ -32,10 +32,15 @@ from ..._compat import (
     load_str_bytes,
     validator,
 )
-from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_ENV_CSG_TOKEN,
+    XINFERENCE_MODEL_DIR,
+)
 from ..utils import (
     IS_NEW_HUGGINGFACE_HUB,
     create_symlink,
+    download_from_csghub,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
@@ -232,6 +237,7 @@ LLAMA_CLASSES: List[Type[LLM]] = []

 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
+BUILTIN_CSGHUB_LLM_FAMILIES: List["LLMFamilyV1"] = []

 SGLANG_CLASSES: List[Type[LLM]] = []
 TRANSFORMERS_CLASSES: List[Type[LLM]] = []
@@ -292,6 +298,9 @@ def cache(
     elif llm_spec.model_hub == "modelscope":
         logger.info(f"Caching from Modelscope: {llm_spec.model_id}")
         return cache_from_modelscope(llm_family, llm_spec, quantization)
+    elif llm_spec.model_hub == "csghub":
+        logger.info(f"Caching from CSGHub: {llm_spec.model_id}")
+        return cache_from_csghub(llm_family, llm_spec, quantization)
     else:
         raise ValueError(f"Unknown model hub: {llm_spec.model_hub}")

@@ -566,6 +575,7 @@ def _skip_download(
         "modelscope": _get_meta_path(
             cache_dir, model_format, "modelscope", quantization
         ),
+        "csghub": _get_meta_path(cache_dir, model_format, "csghub", quantization),
     }
     if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision):
         logger.info(f"Cache {cache_dir} exists")
@@ -650,6 +660,75 @@ def _merge_cached_files(
     logger.info(f"Merge complete.")


+def cache_from_csghub(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
+) -> str:
+    """
+    Cache model from CSGHub. Return the cache directory.
+    """
+    from pycsghub.file_download import file_download
+    from pycsghub.snapshot_download import snapshot_download
+
+    cache_dir = _get_cache_dir(llm_family, llm_spec)
+
+    if _skip_download(
+        cache_dir,
+        llm_spec.model_format,
+        llm_spec.model_hub,
+        llm_spec.model_revision,
+        quantization,
+    ):
+        return cache_dir
+
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+        download_dir = retry_download(
+            snapshot_download,
+            llm_family.model_name,
+            {
+                "model_size": llm_spec.model_size_in_billions,
+                "model_format": llm_spec.model_format,
+            },
+            llm_spec.model_id,
+            endpoint="https://hub-stg.opencsg.com",
+            token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+        )
+        create_symlink(download_dir, cache_dir)
+
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
+        )
+
+        for filename in file_names:
+            download_path = retry_download(
+                file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                file_name=filename,
+                endpoint="https://hub-stg.opencsg.com",
+                token=os.environ.get(XINFERENCE_ENV_CSG_TOKEN),
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
+    else:
+        raise ValueError(f"Unsupported format: {llm_spec.model_format}")
+
+    meta_path = _get_meta_path(
+        cache_dir, llm_spec.model_format, llm_spec.model_hub, quantization
+    )
+    _generate_meta_file(meta_path, llm_family, llm_spec, quantization)
+
+    return cache_dir
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
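
For orientation, cache_from_csghub above follows the same flow as cache_from_modelscope: resolve the cache directory, return early if a valid meta file exists, snapshot-download pytorch/gptq/awq repos or fetch individual GGUF files, symlink them into the cache, and write the meta file. A minimal usage sketch (illustrative only, not part of this diff), assuming the built-in CSGHub families have already been registered by xinference's model setup and the token variable named by XINFERENCE_ENV_CSG_TOKEN is exported:

# Hedged sketch: cache the CSGHub-hosted qwen2-instruct pytorch spec directly.
# Assumptions: xinference >= 0.12.2 is installed, the builtin family lists have
# been populated by xinference's model registration step, and the env var named
# by XINFERENCE_ENV_CSG_TOKEN (the CSGHub access token) is set.
from xinference.model.llm.llm_family import (
    BUILTIN_CSGHUB_LLM_FAMILIES,
    cache_from_csghub,
)

family = next(
    f for f in BUILTIN_CSGHUB_LLM_FAMILIES if f.model_name == "qwen2-instruct"
)
spec = family.model_specs[0]  # the pytorch 0.5B spec from llm_family_csghub.json
print(cache_from_csghub(family, spec, quantization="none"))
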
@@ -931,6 +1010,12 @@ def match_llm(
             + BUILTIN_LLM_FAMILIES
             + user_defined_llm_families
         )
+    elif download_from_csghub():
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
     else:
         all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families


xinference/model/llm/llm_family_csghub.json (new file)
@@ -0,0 +1,66 @@
+[
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "qwen2-instruct",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "tools"
+        ],
+        "model_description": "Qwen2 is the new series of Qwen large language models",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/Qwen2-0.5B-Instruct",
+                "model_hub": "csghub"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
+                ],
+                "model_id": "qwen/Qwen2-0.5B-Instruct-GGUF",
+                "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf",
+                "model_hub": "csghub"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "stop_token_ids": [
+                151643,
+                151644,
+                151645
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|im_start|>",
+                "<|im_end|>"
+            ]
+        }
+    }
+]
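
For the GGUF spec above, model_file_name_template is expanded with the selected quantization when a file is fetched from the hub; a trivial illustration (not part of the diff):

# How the gguf file name template in the csghub spec resolves for one quantization.
template = "qwen2-0_5b-instruct-{quantization}.gguf"
print(template.format(quantization="q4_k_m"))  # qwen2-0_5b-instruct-q4_k_m.gguf
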

xinference/model/llm/llm_family_modelscope.json
@@ -632,6 +632,8 @@
             "model_format": "pytorch",
             "model_size_in_billions": 9,
             "quantizations": [
+                "4-bit",
+                "8-bit",
                 "none"
             ],
             "model_hub": "modelscope",
@@ -2642,7 +2644,8 @@
             "zh"
         ],
         "model_ability": [
-            "chat"
+            "chat",
+            "tools"
         ],
         "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
         "model_specs": [
@@ -2966,7 +2969,8 @@
             "zh"
         ],
         "model_ability": [
-            "chat"
+            "chat",
+            "tools"
         ],
         "model_description": "Qwen2 is the new series of Qwen large language models. ",
         "model_specs": [
@@ -3348,9 +3352,11 @@
             ],
             "intra_message_sep": "<|im_end|>",
             "stop_token_ids": [
+                2,
                 92542
             ],
             "stop": [
+                "</s>",
                 "<|im_end|>"
             ]
         }

xinference/model/llm/pytorch/chatglm.py
@@ -15,6 +15,7 @@ import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union

+from ....core.scheduler import InferenceRequest
 from ....types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,
@@ -89,24 +90,30 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             return False
         return True

-    @staticmethod
-    def _handle_tools(generate_config) -> Optional[dict]:
+    def _handle_tools(self, generate_config) -> Optional[dict]:
         """Convert openai tools to ChatGLM tools."""
         if generate_config is None:
             return None
         tools = generate_config.pop("tools", None)
         if tools is None:
             return None
-        chatglm_tools = []
-        for elem in tools:
-            if elem.get("type") != "function" or "function" not in elem:
-                raise ValueError("ChatGLM tools only support function type.")
-            chatglm_tools.append(elem["function"])
-        return {
-            "role": "system",
-            "content": f"Answer the following questions as best as you can. You have access to the following tools:",
-            "tools": chatglm_tools,
-        }
+        if self.model_family.model_name == "glm4-chat":
+            return {
+                "role": "system",
+                "content": None,
+                "tools": tools,
+            }
+        else:
+            chatglm_tools = []
+            for elem in tools:
+                if elem.get("type") != "function" or "function" not in elem:
+                    raise ValueError("ChatGLM tools only support function type.")
+                chatglm_tools.append(elem["function"])
+            return {
+                "role": "system",
+                "content": f"Answer the following questions as best as you can. You have access to the following tools:",
+                "tools": chatglm_tools,
+            }

     def chat(
         self,
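
To illustrate the branch above: glm4-chat receives the OpenAI-style tools list unchanged, while other ChatGLM models get a system message that carries only the inner function objects. A hedged example of the expected input shape (the weather tool is hypothetical, for illustration only):

# Hypothetical OpenAI-style tool definition as it would arrive in generate_config["tools"].
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool, not part of the diff
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
generate_config = {"tools": tools}
# For non-glm4 ChatGLM models, _handle_tools would return a system message whose
# "tools" field contains only the inner function dicts, i.e. [tools[0]["function"]].
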
@@ -238,3 +245,25 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+
+    @staticmethod
+    def require_attention_mask():
+        """
+        GLM4 needs to use attention mask and position ids during inference.
+        Otherwise, the inference result would be not available.
+        """
+        return True
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Set temperature and top_p to 0.8 by default
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.8
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+
+        return raw_config

xinference/model/llm/pytorch/core.py
@@ -15,7 +15,8 @@
 import json
 import logging
 import os
-from typing import Iterable, Iterator, List, Optional, Union
+from functools import lru_cache
+from typing import Iterable, Iterator, List, Optional, Tuple, Union

 from ....core.scheduler import InferenceRequest
 from ....device_utils import (
@@ -28,6 +29,7 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     Completion,
+    CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
     Embedding,
@@ -281,35 +283,21 @@ class PytorchModel(LLM):
     def generate(
         self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        from .utils import generate_stream, generate_stream_falcon
-
-        model_family_name = self.model_family.model_name.lower()
+        from .utils import generate_stream

         def generator_wrapper(
             prompt: str, generate_config: PytorchGenerateConfig
         ) -> Iterator[CompletionChunk]:
-            if "falcon" in model_family_name:
-                for completion_chunk, completion_usage in generate_stream_falcon(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    completion_chunk["usage"] = completion_usage
-                    yield completion_chunk
-            else:
-                for completion_chunk, completion_usage in generate_stream(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    completion_chunk["usage"] = completion_usage
-                    yield completion_chunk
+            for completion_chunk, completion_usage in generate_stream(
+                self.model_uid,
+                self._model,
+                self._tokenizer,
+                prompt,
+                self._device,
+                generate_config,
+            ):
+                completion_chunk["usage"] = completion_usage
+                yield completion_chunk

         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
@@ -334,26 +322,15 @@

         stream = generate_config.get("stream", False)
         if not stream:
-            if "falcon" in model_family_name:
-                for completion_chunk, completion_usage in generate_stream_falcon(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    pass
-            else:
-                for completion_chunk, completion_usage in generate_stream(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    pass
+            for completion_chunk, completion_usage in generate_stream(
+                self.model_uid,
+                self._model,
+                self._tokenizer,
+                prompt,
+                self._device,
+                generate_config,
+            ):
+                pass
             completion = Completion(
                 id=completion_chunk["id"],
                 object=completion_chunk["object"],
@@ -366,6 +343,105 @@
         else:
             return generator_wrapper(prompt, generate_config)

+    @staticmethod
+    def require_attention_mask():
+        return False
+
+    @lru_cache
+    def get_context_len(self):
+        return get_context_length(self._model.config)
+
+    def get_max_num_seqs(self) -> int:
+        return self._pytorch_model_config.get("max_num_seqs")  # type: ignore
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        return self._sanitize_generate_config(req.generate_config)
+
+    def prepare_batch_inference(self, req_list: List[InferenceRequest]):
+        # check some parameters
+        for r in req_list:
+            if r.sanitized_generate_config is None:
+                r.sanitized_generate_config = self.prepare_sanitize_generate_config(r)
+            if r.is_prefill:
+                # check some generate params
+                max_src_len = get_max_src_len(self.get_context_len(), r)  # type: ignore
+                if max_src_len < 0:
+                    r.stopped = True
+                    r.error_msg = "Max tokens exceeds model's max length"
+                    continue
+                if r.stream_interval <= 0:
+                    r.stopped = True
+                    r.error_msg = "`stream_interval` must be greater than 0"
+                    continue
+                stop_str = r.sanitized_generate_config.get("stop", None)
+                if stop_str and (
+                    not (isinstance(stop_str, str) or isinstance(stop_str, Iterable))
+                ):
+                    r.stopped = True
+                    r.error_msg = "Invalid `stop` field type"
+                    continue
+
+    def _get_builtin_stop_token_ids(self) -> Tuple:
+        return (
+            tuple(self.model_family.prompt_style.stop_token_ids)
+            if self.model_family.prompt_style
+            and self.model_family.prompt_style.stop_token_ids
+            else tuple()
+        )
+
+    def handle_batch_inference_results(self, req_list: List[InferenceRequest]):
+        for req in req_list:
+            if req.error_msg is None:
+                # nothing need handle for non-stream case
+                if req.stream:
+                    results = []
+                    for i, c in enumerate(req.completion):
+                        if c == "<bos_stream>":
+                            chunk = req.completion[i + 1]
+                            results.append(
+                                CompletionChunk(
+                                    id=chunk["id"],
+                                    object=chunk["object"],
+                                    created=chunk["created"],
+                                    model=chunk["model"],
+                                    choices=[
+                                        CompletionChoice(
+                                            text="",
+                                            index=0,
+                                            logprobs=None,
+                                            finish_reason=None,
+                                        )
+                                    ],
+                                )
+                            )
+                            continue
+                        elif c == "<eos_stream>":
+                            break
+                        else:
+                            results.append(c)
+
+                    if req.stopped and req.include_usage:
+                        results.append(req.completion[-1])
+                    req.completion = results
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        from .utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        context_len = self.get_context_len()
+        assert isinstance(context_len, int)
+        batch_inference_one_step(
+            req_list,
+            self.model_uid,
+            self._model,
+            self._tokenizer,
+            self._device,
+            context_len,
+            self._get_builtin_stop_token_ids(),
+            require_attention_mask=self.require_attention_mask(),
+        )
+        self.handle_batch_inference_results(req_list)
+
     def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
         try:
             import torch
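
The <bos_stream>/<eos_stream> markers read by handle_batch_inference_results are sentinels that the batching code places in req.completion; the method rewrites the list so a streaming client sees a leading empty chunk and nothing after the end marker. A runnable toy simulation of that filtering, with plain dicts standing in for CompletionChunk objects (an assumption for illustration, not the actual scheduler types):

# Toy simulation of the sentinel filtering above (dicts stand in for CompletionChunk).
def filter_stream(completion):
    results = []
    for i, c in enumerate(completion):
        if c == "<bos_stream>":
            nxt = completion[i + 1]
            # emit an empty first chunk derived from the chunk that follows the marker
            results.append(
                {
                    **nxt,
                    "choices": [
                        {"text": "", "index": 0, "logprobs": None, "finish_reason": None}
                    ],
                }
            )
            continue
        elif c == "<eos_stream>":
            break
        results.append(c)
    return results


chunks = [
    "<bos_stream>",
    {"id": "c1", "choices": [{"text": "Hel"}]},
    {"id": "c1", "choices": [{"text": "lo"}]},
    "<eos_stream>",
]
print(filter_stream(chunks))
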
@@ -464,7 +540,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             pytorch_model_config,
             peft_model,
         )
-        self._context_len = None

     def _sanitize_generate_config(
         self,
@@ -540,7 +615,6 @@

     def load(self):
         super().load()
-        self._context_len = get_context_length(self._model.config)

     def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
         assert self.model_family.prompt_style is not None
@@ -553,48 +627,14 @@
         )
         return full_prompt

-    def get_max_num_seqs(self) -> int:
-        return self._pytorch_model_config.get("max_num_seqs")  # type: ignore
-
-    def batch_inference(self, req_list: List[InferenceRequest]):
-        from .utils import batch_inference_one_step
-
+    def prepare_batch_inference(self, req_list: List[InferenceRequest]):
+        super().prepare_batch_inference(req_list)
         for r in req_list:
-            if r.sanitized_generate_config is None:
-                r.sanitized_generate_config = self._sanitize_generate_config(
-                    r.generate_config
-                )
-            if r.is_prefill:
-                # check some generate params
-                max_src_len = get_max_src_len(self._context_len, r)  # type: ignore
-                if max_src_len < 0:
-                    r.stopped = True
-                    r.error_msg = "Max tokens exceeds model's max length"
-                    continue
-                if r.stream_interval <= 0:
-                    r.stopped = True
-                    r.error_msg = "`stream_interval` must be greater than 0"
-                    continue
-                stop_str = r.sanitized_generate_config.get("stop", None)
-                if stop_str and (
-                    not (isinstance(stop_str, str) or isinstance(stop_str, Iterable))
-                ):
-                    r.stopped = True
-                    r.error_msg = "Invalid `stop` field type"
-                    continue
-            r.full_prompt = self._get_full_prompt(
-                r.prompt, r.system_prompt, r.chat_history, None
-            )
+            r.full_prompt = self._get_full_prompt(
+                r.prompt, r.system_prompt, r.chat_history, None
+            )

-        assert isinstance(self._context_len, int)
-        batch_inference_one_step(
-            req_list,
-            self.model_uid,
-            self._model,
-            self._tokenizer,
-            self._device,
-            self._context_len,
-        )
+    def handle_batch_inference_results(self, req_list: List[InferenceRequest]):
         for req in req_list:
             if req.stream and req.error_msg is None:
                 if req.completion:

xinference/model/llm/pytorch/glm4v.py
@@ -56,19 +56,40 @@ class Glm4VModel(PytorchChatModel):
             return True
         return False

-    def load(self, **kwargs):
+    def load(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer

         device = self._pytorch_model_config.get("device", "auto")
         self._device = select_device(device)
-        self._device = "auto" if self._device == "cuda" else self._device
+
+        kwargs = {"device_map": self._device}
+        quantization = self.quantization
+
+        # referenced from PytorchModel.load
+        if quantization != "none":
+            if self._device == "cuda" and self._is_linux():
+                kwargs["device_map"] = "auto"
+                self._device = "auto"
+                if quantization == "4-bit":
+                    kwargs["load_in_4bit"] = True
+                elif quantization == "8-bit":
+                    kwargs["load_in_8bit"] = True
+                else:
+                    raise ValueError(
+                        f"Quantization {quantization} is not supported in temporary"
+                    )
+            else:
+                if quantization != "8-bit":
+                    raise ValueError(
+                        f"Only 8-bit quantization is supported if it is not linux system or cuda device"
+                    )

         model = AutoModelForCausalLM.from_pretrained(
             self.model_path,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             torch_dtype=torch.float16,
-            device_map=self._device,
+            **kwargs,
         )
         self._model = model.eval()


xinference/model/llm/pytorch/internlm2.py
@@ -15,6 +15,7 @@ import time
 import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union

+from ....core.scheduler import InferenceRequest
 from ....types import (
     ChatCompletion,
     ChatCompletionChoice,
@@ -88,6 +89,20 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             return False
         return True

+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Overwrite this func for this special model.
+        Cannot use the default configuration, which works poorly on this model.
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.8
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        return raw_config
+
     def chat(
         self,
         prompt: str,

xinference/model/llm/pytorch/qwen_vl.py
@@ -45,7 +45,7 @@ class QwenVLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if "qwen" in model_family.model_name:
+        if "qwen" in model_family.model_name and "vision" in model_family.model_ability:
             return True
         return False
