xinference 0.12.3__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.
Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +56 -8
  3. xinference/client/restful/restful_client.py +49 -4
  4. xinference/core/model.py +36 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/supervisor.py +132 -15
  7. xinference/core/worker.py +239 -53
  8. xinference/deploy/cmdline.py +5 -0
  9. xinference/deploy/utils.py +33 -2
  10. xinference/model/audio/chattts.py +6 -6
  11. xinference/model/audio/core.py +23 -15
  12. xinference/model/core.py +12 -3
  13. xinference/model/embedding/core.py +25 -16
  14. xinference/model/flexible/__init__.py +40 -0
  15. xinference/model/flexible/core.py +228 -0
  16. xinference/model/flexible/launchers/__init__.py +15 -0
  17. xinference/model/flexible/launchers/transformers_launcher.py +63 -0
  18. xinference/model/flexible/utils.py +33 -0
  19. xinference/model/image/core.py +18 -14
  20. xinference/model/image/custom.py +1 -1
  21. xinference/model/llm/__init__.py +5 -2
  22. xinference/model/llm/core.py +3 -2
  23. xinference/model/llm/ggml/llamacpp.py +1 -10
  24. xinference/model/llm/llm_family.json +292 -36
  25. xinference/model/llm/llm_family.py +102 -53
  26. xinference/model/llm/llm_family_modelscope.json +247 -27
  27. xinference/model/llm/mlx/__init__.py +13 -0
  28. xinference/model/llm/mlx/core.py +408 -0
  29. xinference/model/llm/pytorch/chatglm.py +2 -9
  30. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  31. xinference/model/llm/pytorch/core.py +213 -120
  32. xinference/model/llm/pytorch/glm4v.py +171 -15
  33. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  34. xinference/model/llm/pytorch/utils.py +53 -62
  35. xinference/model/llm/utils.py +28 -7
  36. xinference/model/rerank/core.py +29 -25
  37. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  38. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  39. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  40. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  41. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  42. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  43. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  44. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  45. xinference/types.py +0 -1
  46. xinference/web/ui/build/asset-manifest.json +3 -3
  47. xinference/web/ui/build/index.html +1 -1
  48. xinference/web/ui/build/static/js/main.95c1d652.js +3 -0
  49. xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
  65. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/METADATA +10 -11
  66. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/RECORD +71 -69
  67. xinference/model/llm/ggml/chatglm.py +0 -457
  68. xinference/thirdparty/ChatTTS/__init__.py +0 -1
  69. xinference/thirdparty/ChatTTS/core.py +0 -200
  70. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  71. xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
  72. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  73. xinference/thirdparty/ChatTTS/infer/api.py +0 -125
  74. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  75. xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
  76. xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
  77. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  78. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
  79. xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
  80. xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
  81. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  82. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  97. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
  98. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.12.3.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json

@@ -574,19 +574,6 @@
     ],
     "model_description": "ChatGLM is an open-source General Language Model (GLM) based LLM trained on both Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm-6B-GGML",
-        "model_file_name_template": "chatglm-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -622,19 +609,6 @@
     ],
     "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0",
-          "q4_1",
-          "q5_0",
-          "q5_1",
-          "q8_0"
-        ],
-        "model_id": "Xorbits/chatglm2-6B-GGML",
-        "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -706,15 +680,6 @@
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
-      {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 6,
-        "quantizations": [
-          "q4_0"
-        ],
-        "model_id": "Xorbits/chatglm3-6B-GGML",
-        "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 6,
@@ -855,6 +820,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat",
         "model_revision": "b84dc74294ccd507a3d78bde8aebf628221af9bd"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-GGUF",
+        "model_revision": "0155a14edf0176863e9a003cdd78ce599e4d62c0"
       }
     ],
     "prompt_style": {
@@ -900,6 +891,32 @@
         ],
         "model_id": "THUDM/glm-4-9b-chat-1m",
        "model_revision": "715ddbe91082f976ff6a4ca06d59e5bbff6c3642"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "Q2_K",
+          "IQ3_XS",
+          "IQ3_S",
+          "IQ3_M",
+          "Q3_K_S",
+          "Q3_K_L",
+          "Q3_K",
+          "IQ4_XS",
+          "IQ4_NL",
+          "Q4_K_S",
+          "Q4_K",
+          "Q5_K_S",
+          "Q5_K",
+          "Q6_K",
+          "Q8_0",
+          "BF16",
+          "FP16"
+        ],
+        "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+        "model_id": "legraphista/glm-4-9b-chat-1m-GGUF",
+        "model_revision": "782e28bd5eee3c514c07108da15e0b5e06dcf776"
       }
     ],
     "prompt_style": {
@@ -944,7 +961,7 @@
           "none"
         ],
         "model_id": "THUDM/glm-4v-9b",
-        "model_revision": "e8b84fefc07e58a90c8489337675573fda95e289"
+        "model_revision": "6c2e4732db8443f64a48d5af04b74425a7d169c4"
       }
     ],
     "prompt_style": {
@@ -2549,6 +2566,38 @@
         ],
         "model_id": "Qwen/Qwen2-72B-Instruct-AWQ"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "0_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-0.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-MLX"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Qwen2-72B-Instruct-4bit"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2565,6 +2614,82 @@
         ],
         "model_id": "Qwen/Qwen2-0.5B-Instruct-GGUF",
         "model_file_name_template": "qwen2-0_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-1.5B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-1_5b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-7B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-7b-instruct-{quantization}.gguf"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 72,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-72B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-72b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-72b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q5_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q5_k_m": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q6_k": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00004",
+            "00002-of-00004",
+            "00003-of-00004",
+            "00004-of-00004"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -2618,6 +2743,34 @@
           "Int4"
         ],
         "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0",
+          "fp16"
+        ],
+        "model_id": "Qwen/Qwen2-57B-A14B-Instruct-GGUF",
+        "model_file_name_template": "qwen2-57b-a14b-instruct-{quantization}.gguf",
+        "model_file_name_split_template": "qwen2-57b-a14b-instruct-{quantization}-{part}.gguf",
+        "quantization_parts": {
+          "q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "fp16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        }
       }
     ],
     "prompt_style": {
@@ -5809,6 +5962,16 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
       ]
     }
   },
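Note: the paired stop_token_ids/stop fields give the runtime both token-level and string-level stop criteria; the ids appear to correspond one-to-one to the listed special tokens in the Qwen tokenizer. A minimal sketch of how such a check could be applied during decoding (hypothetical helper, not the engine's actual loop):

    # Hypothetical stop check: halt when the model emits a stop token id,
    # or when the decoded text ends with one of the stop strings.
    STOP_TOKEN_IDS = {151643, 151644, 151645}
    STOP_STRINGS = ("<|endoftext|>", "<|im_start|>", "<|im_end|>")

    def should_stop(token_id: int, text_so_far: str) -> bool:
        return token_id in STOP_TOKEN_IDS or text_so_far.endswith(STOP_STRINGS)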
@@ -5997,6 +6160,99 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "gemma-2-it",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-9b-it"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "google/gemma-2-27b-it"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-9b-it-fp16"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-8bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 27,
+        "quantizations": [
+          "None"
+        ],
+        "model_id": "mlx-community/gemma-2-27b-it-fp16"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "gemma",
+      "roles": [
+        "user",
+        "model"
+      ],
+      "stop": [
+        "<end_of_turn>",
+        "<start_of_turn>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
xinference/model/llm/llm_family.py

@@ -14,7 +14,6 @@

 import logging
 import os
-import platform
 import shutil
 from threading import Lock
 from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -107,6 +106,28 @@ class PytorchLLMSpecV1(BaseModel):
         return v


+class MLXLLMSpecV1(BaseModel):
+    model_format: Literal["mlx"]
+    # Must in order that `str` first, then `int`
+    model_size_in_billions: Union[str, int]
+    quantizations: List[str]
+    model_id: Optional[str]
+    model_hub: str = "huggingface"
+    model_uri: Optional[str]
+    model_revision: Optional[str]
+
+    @validator("model_size_in_billions", pre=False)
+    def validate_model_size_with_radix(cls, v: object) -> object:
+        if isinstance(v, str):
+            if (
+                "_" in v
+            ):  # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18
+                return v
+            else:
+                return int(v)
+        return v
+
+
 class PromptStyleV1(BaseModel):
     style_name: str
     system_prompt: str = ""
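Note: the validator above keeps underscore-radix sizes ("1_8" meaning 1.8B) as strings because Python treats the underscore as a digit separator. A standalone repro of the pitfall and the intended behavior, distilled from the diff:

    # int("1_8") == 18 in Python (underscore is a digit separator), so
    # "1_8" must stay a string; plain numeric strings become ints.
    def validate_model_size_with_radix(v):
        if isinstance(v, str):
            return v if "_" in v else int(v)
        return v

    assert int("1_8") == 18                                # the trap being avoided
    assert validate_model_size_with_radix("1_8") == "1_8"  # radix form preserved
    assert validate_model_size_with_radix("7") == 7
    assert validate_model_size_with_radix(72) == 72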
@@ -226,7 +247,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):


 LLMSpecV1 = Annotated[
-    Union[GgmlLLMSpecV1, PytorchLLMSpecV1],
+    Union[GgmlLLMSpecV1, PytorchLLMSpecV1, MLXLLMSpecV1],
     Field(discriminator="model_format"),
 ]

@@ -249,6 +270,8 @@ UD_LLM_FAMILIES_LOCK = Lock()

 VLLM_CLASSES: List[Type[LLM]] = []

+MLX_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}

@@ -517,15 +540,20 @@ def _get_cache_dir_for_model_mem(
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
+    quantization: Optional[str] = None,
     create_if_not_exist=True,
 ):
     # If the model id contains quantization, then we should give each
     # quantization a dedicated cache dir.
     quant_suffix = ""
-    for q in llm_spec.quantizations:
-        if llm_spec.model_id and q in llm_spec.model_id:
-            quant_suffix = q
-            break
+    if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+        quant_suffix = quantization
+    else:
+        for q in llm_spec.quantizations:
+            if llm_spec.model_id and q in llm_spec.model_id:
+                quant_suffix = q
+                break
+
     cache_dir_name = (
         f"{llm_family.model_name}-{llm_spec.model_format}"
         f"-{llm_spec.model_size_in_billions}b"
@@ -549,7 +577,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")
@@ -588,7 +616,7 @@ def _skip_download(
                     logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
                     return True
             return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)
@@ -683,7 +711,7 @@ def cache_from_csghub(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -751,7 +779,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,
@@ -820,8 +848,8 @@ def cache_from_huggingface(
     if not IS_NEW_HUGGINGFACE_HUB:
         use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}

-    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
-        assert isinstance(llm_spec, PytorchLLMSpecV1)
+    if llm_spec.model_format in ["pytorch", "gptq", "awq", "mlx"]:
+        assert isinstance(llm_spec, (PytorchLLMSpecV1, MLXLLMSpecV1))
         download_dir = retry_download(
             huggingface_hub.snapshot_download,
             llm_family.model_name,
@@ -876,6 +904,7 @@ def _check_revision(
     llm_spec: "LLMSpecV1",
     builtin: list,
     meta_path: str,
+    quantization: Optional[str] = None,
 ) -> bool:
     for family in builtin:
         if llm_family.model_name == family.model_name:
@@ -884,59 +913,63 @@
                 if (
                     spec.model_format == "pytorch"
                     and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                    and (quantization is None or quantization in spec.quantizations)
                 ):
                     return valid_model_revision(meta_path, spec.model_revision)
     return False


 def get_cache_status(
-    llm_family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+    llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
 ) -> Union[bool, List[bool]]:
     """
-    When calling this function from above, `llm_family` is constructed only from BUILTIN_LLM_FAMILIES,
-    so we should check both huggingface and modelscope cache files.
+    Checks if a model's cache status is available based on the model format and quantization.
+    Supports different directories and model formats.
     """
-    cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-    # check revision for pytorch model
-    if llm_spec.model_format == "pytorch":
-        hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-        ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-        revisions = [
-            _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-            _check_revision(
-                llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-            ),
-        ]
-        return any(revisions)
-    # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
-        ret = []
-        for q in llm_spec.quantizations:
-            assert q is not None
-            hf_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "huggingface", q
-            )
-            ms_meta_path = _get_meta_path(
-                cache_dir, llm_spec.model_format, "modelscope", q
-            )
-            results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-            ret.append(any(results))
-        return ret
-    else:
-        raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
-
-def _is_linux():
-    return platform.system() == "Linux"
-
-
-def _has_cuda_device():
-    # `cuda_count` method already contains the logic for the
-    # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-    from ...utils import cuda_count
-
-    return cuda_count() > 0
+
+    def check_file_status(meta_path: str) -> bool:
+        return os.path.exists(meta_path)
+
+    def check_revision_status(
+        meta_path: str, families: list, quantization: Optional[str] = None
+    ) -> bool:
+        return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
+
+    def handle_quantization(q: Union[str, None]) -> bool:
+        specific_cache_dir = _get_cache_dir(
+            llm_family, llm_spec, q, create_if_not_exist=False
+        )
+        meta_paths = {
+            "huggingface": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "huggingface", q
+            ),
+            "modelscope": _get_meta_path(
+                specific_cache_dir, llm_spec.model_format, "modelscope", q
+            ),
+        }
+        if llm_spec.model_format == "pytorch":
+            return check_revision_status(
+                meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+            ) or check_revision_status(
+                meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+            )
+        else:
+            return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                meta_paths["modelscope"]
+            )
+
+    if llm_spec.model_id and "{" in llm_spec.model_id:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if quantization is None
+            else handle_quantization(quantization)
+        )
+    else:
+        return (
+            [handle_quantization(q) for q in llm_spec.quantizations]
+            if llm_spec.model_format != "pytorch"
+            else handle_quantization(None)
+        )


 def get_user_defined_llm_families():
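Note: the rewritten get_cache_status keeps its old return shape (a single bool for pytorch specs, one bool per quantization otherwise) while adding per-quantization cache dirs for templated model ids. A small sketch of just that dispatch (hypothetical helper mirroring the branches above, with string placeholders standing in for the real filesystem checks):

    from typing import List, Optional, Union

    def cache_status_shape(
        model_format: str,
        model_id: str,
        quantizations: List[str],
        quantization: Optional[str] = None,
    ) -> Union[str, List[str]]:
        def handle(q: Optional[str]) -> str:
            return f"check cache for quantization={q!r}"

        if "{" in model_id:  # templated id: dedicated dir per quantization
            if quantization is not None:
                return handle(quantization)
            return [handle(q) for q in quantizations]
        # plain id: pytorch shares one dir, other formats check each quantization
        if model_format == "pytorch":
            return handle(None)
        return [handle(q) for q in quantizations]

    print(cache_status_shape("pytorch", "THUDM/glm-4-9b-chat", ["none", "4-bit"]))
    print(cache_status_shape("ggufv2", "Qwen/Qwen2-7B-Instruct-GGUF", ["q4_0", "q8_0"]))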
@@ -982,6 +1015,7 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
+    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1005,7 +1039,22 @@
             spec.model_id = spec.model_id.format(quantization=q)
         return spec

-    if download_from_modelscope():
+    # priority: download_hub > download_from_modelscope() and download_from_csghub()
+    if download_hub == "modelscope":
+        all_families = (
+            BUILTIN_MODELSCOPE_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "csghub":
+        all_families = (
+            BUILTIN_CSGHUB_LLM_FAMILIES
+            + BUILTIN_LLM_FAMILIES
+            + user_defined_llm_families
+        )
+    elif download_hub == "huggingface":
+        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    elif download_from_modelscope():
        all_families = (
            BUILTIN_MODELSCOPE_LLM_FAMILIES
            + BUILTIN_LLM_FAMILIES
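Note: per the priority comment in the hunk above (which this diff view truncates), an explicit download_hub argument now overrides the download_from_modelscope()/download_from_csghub() environment checks, with the huggingface built-ins as the fallback. A condensed, hypothetical sketch of the resulting search order:

    # Hypothetical condensation of the hub-priority logic above (not the
    # real function): explicit argument first, then environment toggles.
    def family_search_order(download_hub=None, env_modelscope=False, env_csghub=False):
        if download_hub == "modelscope" or (download_hub is None and env_modelscope):
            return ["modelscope builtins", "huggingface builtins", "user-defined"]
        if download_hub == "csghub" or (download_hub is None and env_csghub):
            return ["csghub builtins", "huggingface builtins", "user-defined"]
        return ["huggingface builtins", "user-defined"]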