xinference 0.13.0__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +50 -2
- xinference/client/restful/restful_client.py +49 -2
- xinference/core/model.py +15 -0
- xinference/core/supervisor.py +132 -15
- xinference/core/worker.py +165 -8
- xinference/deploy/cmdline.py +5 -0
- xinference/model/audio/chattts.py +6 -6
- xinference/model/audio/core.py +23 -15
- xinference/model/core.py +12 -3
- xinference/model/embedding/core.py +25 -16
- xinference/model/flexible/__init__.py +40 -0
- xinference/model/flexible/core.py +228 -0
- xinference/model/flexible/launchers/__init__.py +15 -0
- xinference/model/flexible/launchers/transformers_launcher.py +63 -0
- xinference/model/flexible/utils.py +33 -0
- xinference/model/image/core.py +18 -14
- xinference/model/image/custom.py +1 -1
- xinference/model/llm/__init__.py +0 -2
- xinference/model/llm/core.py +3 -2
- xinference/model/llm/ggml/llamacpp.py +1 -10
- xinference/model/llm/llm_family.json +52 -35
- xinference/model/llm/llm_family.py +71 -46
- xinference/model/llm/llm_family_modelscope.json +55 -27
- xinference/model/llm/pytorch/core.py +0 -80
- xinference/model/llm/utils.py +4 -2
- xinference/model/rerank/core.py +24 -25
- xinference/types.py +0 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
- xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/METADATA +7 -11
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/RECORD +45 -54
- xinference/model/llm/ggml/chatglm.py +0 -457
- xinference/thirdparty/ChatTTS/__init__.py +0 -1
- xinference/thirdparty/ChatTTS/core.py +0 -200
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +0 -125
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
- xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
- xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
- xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
- /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.py
CHANGED

@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,15 +540,20 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-
-
-
-
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"

@@ -900,6 +904,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:

@@ -908,59 +913,63 @@ def _check_revision(
             if (
                 spec.model_format == "pytorch"
                 and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                and (quantization is None or quantization in spec.quantizations)
             ):
                 return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-
-
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-

-    def
-
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)

-    def
-
-
-
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)

+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )

-
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
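
The hunks above make the LLM cache layout quantization-aware: _get_cache_dir() and get_cache_status() now accept an optional quantization, so specs whose model_id is templated with {quantization} get one cache directory per quantization. A minimal sketch of how these helpers can be exercised, assuming an already-resolved family/spec pair (the objects themselves are placeholders; only the signatures come from the diff above):

# Hedged sketch: only the call signatures follow this release's diff.
from xinference.model.llm.llm_family import _get_cache_dir, get_cache_status

def show_cache_layout(llm_family, llm_spec):
    # One dedicated cache dir per quantization when model_id contains "{quantization}".
    for q in llm_spec.quantizations:
        path = _get_cache_dir(llm_family, llm_spec, q, create_if_not_exist=False)
        print(q, "->", path)
    # get_cache_status() also takes the quantization now; for non-pytorch formats
    # it may still return one boolean per quantization when none is given.
    print(get_cache_status(llm_family, llm_spec, llm_spec.quantizations[0]))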
@@ -1006,6 +1015,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.

@@ -1029,7 +1039,22 @@ def match_llm(
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

-
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
         all_families = (
             BUILTIN_MODELSCOPE_LLM_FAMILIES
             + BUILTIN_LLM_FAMILIES
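
match_llm() gains a download_hub argument that takes priority over the download_from_modelscope()/download_from_csghub() environment checks when choosing which registries to search. A hedged usage sketch; the model name and quantization are illustrative values taken from specs in this release, and only the parameter names come from the diff:

from xinference.model.llm.llm_family import match_llm

# Assumption: the first positional parameter is the model name, as in earlier releases.
result = match_llm(
    "glm-4-9b-chat",
    model_format="ggufv2",
    quantization="Q4_K",
    download_hub="modelscope",  # search the ModelScope registry first
)
if result is not None:
    llm_family, llm_spec, quantization = result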
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -304,21 +304,6 @@
         ],
         "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0",
-                    "q4_1",
-                    "q5_0",
-                    "q5_1",
-                    "q8_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/chatglm2-6B-GGML",
-                "model_revision": "v1.0.0",
-                "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,

@@ -392,17 +377,6 @@
         ],
         "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
         "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "q4_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/chatglm3-ggml",
-                "model_revision": "v1.0.0",
-                "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 6,

@@ -547,6 +521,33 @@
                 "model_hub": "modelscope",
                 "model_id": "ZhipuAI/glm-4-9b-chat",
                 "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+                "model_revision": "master"
             }
         ],
         "prompt_style": {

@@ -593,6 +594,33 @@
                 "model_hub": "modelscope",
                 "model_id": "ZhipuAI/glm-4-9b-chat-1m",
                 "model_revision": "master"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 9,
+                "quantizations": [
+                    "Q2_K",
+                    "IQ3_XS",
+                    "IQ3_S",
+                    "IQ3_M",
+                    "Q3_K_S",
+                    "Q3_K_L",
+                    "Q3_K",
+                    "IQ4_XS",
+                    "IQ4_NL",
+                    "Q4_K_S",
+                    "Q4_K",
+                    "Q5_K_S",
+                    "Q5_K",
+                    "Q6_K",
+                    "Q8_0",
+                    "BF16",
+                    "FP16"
+                ],
+                "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+                "model_hub": "modelscope",
+                "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+                "model_revision": "master"
             }
         ],
         "prompt_style": {

@@ -4115,7 +4143,7 @@
             "zh"
         ],
         "model_ability": [
-            "
+            "chat"
         ],
         "model_description": "Aquila2-chat series models are the chat models",
         "model_specs": [
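
For reference, the model_file_name_template in the GGUF specs added above expands once per quantization; a small illustration of that resolution, with the template and quantization labels copied from the glm-4-9b-chat spec:

template = "glm-4-9b-chat.{quantization}.gguf"
for quantization in ("Q4_K", "Q8_0", "FP16"):
    print(template.format(quantization=quantization))
# glm-4-9b-chat.Q4_K.gguf
# glm-4-9b-chat.Q8_0.gguf
# glm-4-9b-chat.FP16.gguf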
xinference/model/llm/pytorch/core.py
CHANGED

@@ -34,9 +34,6 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CreateCompletionTorch,
-    Embedding,
-    EmbeddingData,
-    EmbeddingUsage,
     LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,

@@ -673,83 +670,6 @@ class PytorchModel(LLM):
             )
         self.handle_batch_inference_results(req_list)

-    def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-        try:
-            import torch
-            import torch.nn.functional as F
-        except ImportError as e:
-            raise ImportError(
-                "Could not import torch. Please install it with `pip install torch`."
-            ) from e
-
-        if isinstance(input, str):
-            inputs = [input]
-        else:
-            inputs = input
-
-        tokenizer = self._tokenizer
-        tokenizer.pad_token = tokenizer.eos_token
-        is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-        is_chatglm = "chatglm" in str(type(self._model))
-        if is_llama:
-            encoding = tokenizer.batch_encode_plus(
-                inputs, padding=True, return_tensors="pt"
-            )
-            input_ids = encoding["input_ids"].to(self._device)
-            attention_mask = encoding["attention_mask"].to(self._device)
-            model_output = self._model(
-                input_ids, attention_mask, output_hidden_states=True
-            )
-            data = model_output.hidden_states[-1]
-            mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-            masked_embeddings = data * mask
-            sum_embeddings = torch.sum(masked_embeddings, dim=1)
-            seq_length = torch.sum(mask, dim=1)
-            embedding = sum_embeddings / seq_length
-            normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-            normalized_embeddings = normalized_embeddings.tolist()
-            token_num = torch.sum(attention_mask).item()
-
-            embedding_list = []
-            for index, data in enumerate(normalized_embeddings):
-                embedding_list.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-            ret = Embedding(
-                object="list",
-                model=self.model_uid,
-                data=embedding_list,
-                usage=usage,
-            )
-
-        else:
-            embedding = []
-            token_num = 0
-            for index, text in enumerate(inputs):
-                input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                model_output = self._model(input_ids, output_hidden_states=True)
-                if is_chatglm:
-                    data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                else:
-                    data = model_output.hidden_states[-1][0]
-                data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                data = data.tolist()
-
-                embedding.append(
-                    EmbeddingData(index=index, object="embedding", embedding=data)
-                )
-                token_num += len(input_ids[0])
-
-            usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-            ret = Embedding(
-                object="list", model=self.model_uid, data=embedding, usage=usage
-            )
-
-        return ret
-

 class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
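
With PytorchModel.create_embedding() removed, embeddings are expected to come from dedicated embedding models rather than from an LLM handle. A minimal client-side sketch, assuming a running local endpoint and the built-in bge-small-en-v1.5 embedding model (both are assumptions, not part of this diff):

# Hedged sketch, not part of the diff above.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local supervisor endpoint
model_uid = client.launch_model(model_name="bge-small-en-v1.5", model_type="embedding")
embedding_model = client.get_model(model_uid)
print(embedding_model.create_embedding("What is the capital of France?"))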
xinference/model/llm/utils.py
CHANGED

@@ -779,8 +779,10 @@ Begin!"""
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
 ) -> Tuple[str, bool]:
-    cache_dir = _get_cache_dir(
-
+    cache_dir = _get_cache_dir(
+        llm_family, spec, quantization, create_if_not_exist=False
+    )
+    cache_status = get_cache_status(llm_family, spec, quantization)
     if isinstance(cache_status, list):
         is_cached = None
         for q, cs in zip(spec.quantizations, cache_status):
xinference/model/rerank/core.py
CHANGED

@@ -18,7 +18,7 @@ import os
 import uuid
 from collections import defaultdict
 from collections.abc import Sequence
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple

 import numpy as np
 import torch

@@ -285,7 +285,12 @@ def cache(model_spec: RerankModelSpec):


 def create_rerank_model_instance(
-    subpool_addr: str,
+    subpool_addr: str,
+    devices: List[str],
+    model_uid: str,
+    model_name: str,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    **kwargs,
 ) -> Tuple[RerankModel, RerankModelDescription]:
     from ..utils import download_from_modelscope
     from . import BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS

@@ -298,30 +303,24 @@ def create_rerank_model_instance(
             break

     if model_spec is None:
-        if
-
-
-
-
-
-
-
-
-
-
-        raise ValueError(
-            f"Rerank model {model_name} not found, available"
-            f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-        )
+        if download_hub == "huggingface" and model_name in BUILTIN_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in Huggingface.")
+            model_spec = BUILTIN_RERANK_MODELS[model_name]
+        elif download_hub == "modelscope" and model_name in MODELSCOPE_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in ModelScope.")
+            model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+        elif download_from_modelscope() and model_name in MODELSCOPE_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in ModelScope.")
+            model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+        elif model_name in BUILTIN_RERANK_MODELS:
+            logger.debug(f"Rerank model {model_name} found in Huggingface.")
+            model_spec = BUILTIN_RERANK_MODELS[model_name]
         else:
-
-
-
-
-            f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-            )
+            raise ValueError(
+                f"Rerank model {model_name} not found, available"
+                f"Huggingface: {BUILTIN_RERANK_MODELS.keys()}"
+                f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
+            )

     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
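
create_rerank_model_instance() now spells out its full signature and accepts a download_hub override that is consulted before the download_from_modelscope() auto-detection. A hedged sketch of a direct call; every value below is a placeholder, only the parameter names follow the diff above:

from xinference.model.rerank.core import create_rerank_model_instance

model, description = create_rerank_model_instance(
    subpool_addr="127.0.0.1:37567",  # hypothetical worker subpool address
    devices=["cuda:0"],
    model_uid="my-reranker",
    model_name="bge-reranker-base",
    download_hub="modelscope",  # pin the registry instead of relying on auto-detection
)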
xinference/types.py
CHANGED

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.4bafd904.css",
-    "main.js": "./static/js/main.0fb6f3ab.js",
+    "main.js": "./static/js/main.95c1d652.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.4bafd904.css.map": "./static/css/main.4bafd904.css.map",
-    "main.0fb6f3ab.js.map": "./static/js/main.0fb6f3ab.js.map"
+    "main.95c1d652.js.map": "./static/js/main.95c1d652.js.map"
   },
   "entrypoints": [
     "static/css/main.4bafd904.css",
-    "static/js/main.0fb6f3ab.js"
+    "static/js/main.95c1d652.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.0fb6f3ab.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.95c1d652.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>