xinference 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (70)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +123 -3
  3. xinference/client/restful/restful_client.py +131 -2
  4. xinference/core/model.py +93 -24
  5. xinference/core/supervisor.py +132 -15
  6. xinference/core/worker.py +165 -8
  7. xinference/deploy/cmdline.py +5 -0
  8. xinference/model/audio/chattts.py +46 -14
  9. xinference/model/audio/core.py +23 -15
  10. xinference/model/core.py +12 -3
  11. xinference/model/embedding/core.py +25 -16
  12. xinference/model/flexible/__init__.py +40 -0
  13. xinference/model/flexible/core.py +228 -0
  14. xinference/model/flexible/launchers/__init__.py +15 -0
  15. xinference/model/flexible/launchers/transformers_launcher.py +63 -0
  16. xinference/model/flexible/utils.py +33 -0
  17. xinference/model/image/core.py +21 -14
  18. xinference/model/image/custom.py +1 -1
  19. xinference/model/image/model_spec.json +14 -0
  20. xinference/model/image/stable_diffusion/core.py +43 -6
  21. xinference/model/llm/__init__.py +0 -2
  22. xinference/model/llm/core.py +3 -2
  23. xinference/model/llm/ggml/llamacpp.py +1 -10
  24. xinference/model/llm/llm_family.json +292 -36
  25. xinference/model/llm/llm_family.py +97 -52
  26. xinference/model/llm/llm_family_modelscope.json +220 -27
  27. xinference/model/llm/pytorch/core.py +0 -80
  28. xinference/model/llm/sglang/core.py +7 -2
  29. xinference/model/llm/utils.py +4 -2
  30. xinference/model/llm/vllm/core.py +3 -0
  31. xinference/model/rerank/core.py +24 -25
  32. xinference/types.py +0 -1
  33. xinference/web/ui/build/asset-manifest.json +3 -3
  34. xinference/web/ui/build/index.html +1 -1
  35. xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
  36. xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
  43. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/METADATA +9 -11
  44. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/RECORD +49 -58
  45. xinference/model/llm/ggml/chatglm.py +0 -457
  46. xinference/thirdparty/ChatTTS/__init__.py +0 -1
  47. xinference/thirdparty/ChatTTS/core.py +0 -200
  48. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  49. xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
  50. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  51. xinference/thirdparty/ChatTTS/infer/api.py +0 -125
  52. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  53. xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
  54. xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
  55. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  56. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
  57. xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
  58. xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
  59. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
  66. /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
  67. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/LICENSE +0 -0
  68. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/WHEEL +0 -0
  69. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/entry_points.txt +0 -0
  70. {xinference-0.13.0.dist-info → xinference-0.13.2.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@
 
 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,25 +540,50 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
-    cache_dir_name = (
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
+    # some model name includes ".", e.g. qwen1.5-chat
+    # if the model does not require trust_remote_code, it's OK
+    # because no need to import modeling_xxx.py from the path
+    # but when the model need to trust_remote_code,
+    # e.g. internlm2.5-chat, the import will fail,
+    # but before the model may have been downloaded,
+    # thus we check it first, if exist, return it,
+    # otherwise, we replace the "." with "_" in model name
+    old_cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
     )
     if quant_suffix:
-        cache_dir_name += f"-{quant_suffix}"
-    cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
-    if create_if_not_exist and not os.path.exists(cache_dir):
-        os.makedirs(cache_dir, exist_ok=True)
-    return cache_dir
+        old_cache_dir_name += f"-{quant_suffix}"
+    old_cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, old_cache_dir_name)
+    )
+    if os.path.exists(old_cache_dir):
+        return old_cache_dir
+    else:
+        cache_dir_name = (
+            f"{llm_family.model_name.replace('.', '_')}-{llm_spec.model_format}"
+            f"-{llm_spec.model_size_in_billions}b"
+        )
+        if quant_suffix:
+            cache_dir_name += f"-{quant_suffix}"
+        cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name))
+        if create_if_not_exist and not os.path.exists(cache_dir):
+            os.makedirs(cache_dir, exist_ok=True)
+        return cache_dir
 
 
 def _get_meta_path(
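The practical effect of this change is easiest to see in isolation. Below is a minimal, standalone sketch of the new naming rule, not the xinference implementation itself; the cache root path and the helper name are assumptions made for the example.

# Sketch of the 0.13.2 cache-directory naming rule (hypothetical helper).
import os

CACHE_ROOT = os.path.expanduser("~/.xinference/cache")  # assumed stand-in for XINFERENCE_CACHE_DIR


def cache_dir_for(model_name: str, model_format: str, size_b: int, quant: str = "") -> str:
    suffix = f"-{quant}" if quant else ""
    # Old naming keeps the ".", so directories downloaded by <= 0.13.0 stay valid.
    old_dir = os.path.join(CACHE_ROOT, f"{model_name}-{model_format}-{size_b}b{suffix}")
    if os.path.exists(old_dir):
        return old_dir
    # New naming replaces "." with "_" so trust_remote_code imports do not break.
    safe_name = model_name.replace(".", "_")
    return os.path.join(CACHE_ROOT, f"{safe_name}-{model_format}-{size_b}b{suffix}")


# e.g. a fresh internlm2.5-chat download lands in .../internlm2_5-chat-pytorch-7b
print(cache_dir_for("internlm2.5-chat", "pytorch", 7))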
@@ -900,6 +924,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -908,59 +933,63 @@ def _check_revision(
                 if (
                     spec.model_format == "pytorch"
                     and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                    and (quantization is None or quantization in spec.quantizations)
                 ):
                     return valid_model_revision(meta_path, spec.model_revision)
     return False
 
 
 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-    When calling this function from above, `llm_family` is constructed only from BUILTIN_LLM_FAMILIES,
-    so we should check both huggingface and modelscope cache files.
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
 
-def _is_linux():
-    return platform.system() == "Linux"
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)
 
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
 
-def _has_cuda_device():
-    # `cuda_count` method already contains the logic for the
-    # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-    from ...utils import cuda_count
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )
 
-    return cuda_count() > 0
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )
 
 
 def get_user_defined_llm_families():
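Note that the return type stays Union[bool, List[bool]]: pytorch specs yield a single flag, while formats with per-quantization files yield one flag per entry in spec.quantizations. A hedged sketch of how a caller might consume that shape (mirroring the zip pattern used by get_file_location later in this diff); the helper name is hypothetical:

# Sketch of consuming get_cache_status()'s Union[bool, List[bool]] result.
from typing import List, Union


def cached_quantizations(quantizations: List[str],
                         cache_status: Union[bool, List[bool]]) -> List[str]:
    if isinstance(cache_status, bool):
        # pytorch format: one flag covers the whole spec
        return list(quantizations) if cache_status else []
    # ggufv2 / gptq / awq / mlx: flags are parallel to spec.quantizations
    return [q for q, cached in zip(quantizations, cache_status) if cached]


print(cached_quantizations(["Q4_K_M", "Q8_0"], [True, False]))  # ['Q4_K_M']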
@@ -1006,6 +1035,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1029,7 +1059,22 @@
             spec.model_id = spec.model_id.format(quantization=q)
         return spec
 
-    if download_from_modelscope():
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES
@@ -304,21 +304,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -392,17 +377,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_hub": "modelscope",
-        "model_id": "Xorbits/chatglm3-ggml",
-        "model_revision": "v1.0.0",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -547,6 +521,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -593,6 +594,33 @@
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/glm-4-9b-chat-1m",
         "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "master"
       }
     ],
     "prompt_style": {
@@ -660,6 +688,66 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "codegeex4",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "the open-source version of the latest CodeGeeX4 model series",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "ZhipuAI/codegeex4-all-9b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "IQ2_M",
+          "IQ3_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K_L",
+          "Q8_0"
+        ],
+        "model_file_name_template": "codegeex4-all-9b-{quantization}.gguf",
+        "model_id": "ZhipuAI/codegeex4-all-9b-GGUF",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATGLM3",
+      "system_prompt": "",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "stop_token_ids": [
+        151329,
+        151336,
+        151338
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|user|>",
+        "<|observation|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -900,6 +988,88 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm2.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 262144,
+    "model_name": "internlm2.5-chat-1m",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "InternLM2.5 series of the InternLM model supports 1M long-context",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-1m",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "stop_token_ids": [
+        2,
+        92542
+      ],
+      "stop": [
+        "</s>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 100000,
@@ -3771,6 +3941,29 @@
         ],
         "model_id": "AI-ModelScope/gemma-2-27b-it",
         "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_K_L",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_K_L",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q6_K_L",
+          "Q8_0",
+          "f32"
+        ],
+        "model_id": "LLM-Research/gemma-2-9b-it-GGUF",
+        "model_file_name_template": "gemma-2-9b-it-{quantization}.gguf",
+        "model_hub": "modelscope"
       }
     ],
     "prompt_style": {
@@ -4115,7 +4308,7 @@
       "zh"
     ],
     "model_ability": [
-      "generate"
+      "chat"
     ],
     "model_description": "Aquila2-chat series models are the chat models",
     "model_specs": [
@@ -34,9 +34,6 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
-    Embedding,
-    EmbeddingData,
-    EmbeddingUsage,
     LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
@@ -673,83 +670,6 @@ class PytorchModel(LLM):
         )
         self.handle_batch_inference_results(req_list)
 
-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        try:
-            import torch
-            import torch.nn.functional as F
-        except ImportError as e:
-            raise ImportError(
-                "Could not import torch. Please install it with `pip install torch`."
-            ) from e
-
-        if isinstance(input, str):
-            inputs = [input]
-        else:
-            inputs = input
-
-        tokenizer = self._tokenizer
-        tokenizer.pad_token = tokenizer.eos_token
-        is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-        is_chatglm = "chatglm" in str(type(self._model))
-        if is_llama:
-            encoding = tokenizer.batch_encode_plus(
-                inputs, padding=True, return_tensors="pt"
-            )
-            input_ids = encoding["input_ids"].to(self._device)
-            attention_mask = encoding["attention_mask"].to(self._device)
-            model_output = self._model(
-                input_ids, attention_mask, output_hidden_states=True
-            )
-            data = model_output.hidden_states[-1]
-            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-            normalized_embeddings = normalized_embeddings.tolist()
-            token_num = torch.sum(attention_mask).item()
-
-            embedding_list = []
-            for index, data in enumerate(normalized_embeddings):
-                embedding_list.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-            ret = Embedding(
-                object="list",
-                model=self.model_uid,
-                data=embedding_list,
-                usage=usage,
-            )
-
-        else:
-            embedding = []
-            token_num = 0
-            for index, text in enumerate(inputs):
-                input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                model_output = self._model(input_ids, output_hidden_states=True)
-                if is_chatglm:
-                    data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                else:
-                    data = model_output.hidden_states[-1][0]
-                data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                data = data.tolist()
-
-                embedding.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-                token_num += len(input_ids[0])
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-            ret = Embedding(
-                object="list", model=self.model_uid, data=embedding, usage=usage
-            )
-
-        return ret
-
 
 class PytorchChatModel(PytorchModel, ChatModelMixin):
@@ -269,8 +269,13 @@ class SGLANGModel(LLM):
         )
         stream = sanitized_generate_config.pop("stream")
         stream_options = sanitized_generate_config.pop("stream_options")
-        if isinstance(stream_options, dict):
-            include_usage = stream_options.pop("include_usage", False)
+
+        include_usage = (
+            stream_options.pop("include_usage")
+            if isinstance(stream_options, dict)
+            else False
+        )
+
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
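The rewrite makes include_usage always defined: previously it was only assigned inside the isinstance check, so a non-dict stream_options left the name unbound further down. A tiny sketch of the new shape using plain dicts (the helper is hypothetical, not the SGLANG code path):

# Sketch of the include_usage handling after this change.
def resolve_include_usage(stream_options):
    # mirrors the new conditional expression: always yields a value,
    # even when stream_options is None
    return (
        stream_options.pop("include_usage")
        if isinstance(stream_options, dict)
        else False
    )


print(resolve_include_usage({"include_usage": True}))  # True
print(resolve_include_usage(None))                     # False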
@@ -779,8 +779,10 @@ Begin!"""
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
 ) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(llm_family, spec, create_if_not_exist=False)
-    cache_status = get_cache_status(llm_family, spec)
+    cache_dir = _get_cache_dir(
+        llm_family, spec, quantization, create_if_not_exist=False
+    )
+    cache_status = get_cache_status(llm_family, spec, quantization)
     if isinstance(cache_status, list):
         is_cached = None
         for q, cs in zip(spec.quantizations, cache_status):
@@ -112,6 +112,8 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm-chat-8k",
     "internlm-chat-20b",
     "internlm2-chat",
+    "internlm2.5-chat",
+    "internlm2.5-chat-1m",
     "qwen-chat",
     "Yi-chat",
     "Yi-1.5-chat",
@@ -127,6 +129,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "chatglm3-128k",
     "glm4-chat",
     "glm4-chat-1m",
+    "codegeex4",
     "deepseek-chat",
     "deepseek-coder-instruct",
 ]
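Adding a model name to VLLM_SUPPORTED_CHAT_MODELS is what allows the vLLM backend to be selected for it. A simplified, hypothetical sketch of that kind of gate (not the actual xinference match() implementation), with the list reduced to a few entries from this hunk:

# Hypothetical gate illustrating how a supported-models list is typically used.
VLLM_SUPPORTED_CHAT_MODELS = [
    "internlm2-chat",
    "internlm2.5-chat",
    "internlm2.5-chat-1m",
    "glm4-chat",
    "codegeex4",
]


def vllm_can_serve(model_name: str) -> bool:
    return model_name in VLLM_SUPPORTED_CHAT_MODELS


print(vllm_can_serve("codegeex4"))   # True once the 0.13.2 entry is present
print(vllm_can_serve("chatglm2"))    # False: not in this reduced list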