xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/event.py +1 -1
  8. xinference/core/model.py +15 -4
  9. xinference/core/status_guard.py +1 -1
  10. xinference/core/supervisor.py +58 -72
  11. xinference/core/worker.py +73 -102
  12. xinference/deploy/cmdline.py +175 -6
  13. xinference/deploy/test/test_cmdline.py +2 -0
  14. xinference/deploy/utils.py +1 -1
  15. xinference/device_utils.py +29 -3
  16. xinference/fields.py +5 -1
  17. xinference/model/audio/model_spec.json +8 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/embedding/core.py +13 -0
  21. xinference/model/image/__init__.py +29 -0
  22. xinference/model/image/core.py +6 -0
  23. xinference/model/image/custom.py +109 -0
  24. xinference/model/llm/__init__.py +92 -32
  25. xinference/model/llm/core.py +57 -102
  26. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  27. xinference/model/llm/llm_family.json +446 -2
  28. xinference/model/llm/llm_family.py +45 -41
  29. xinference/model/llm/llm_family_modelscope.json +208 -1
  30. xinference/model/llm/pytorch/deepseek_vl.py +89 -33
  31. xinference/model/llm/pytorch/qwen_vl.py +67 -12
  32. xinference/model/llm/pytorch/yi_vl.py +62 -45
  33. xinference/model/llm/utils.py +45 -15
  34. xinference/model/llm/vllm/core.py +21 -4
  35. xinference/model/rerank/core.py +48 -20
  36. xinference/thirdparty/omnilmm/chat.py +2 -1
  37. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  38. xinference/types.py +2 -0
  39. xinference/web/ui/build/asset-manifest.json +6 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  42. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  43. xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
  44. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
  45. xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
  61. xinference/web/ui/node_modules/.package-lock.json +33 -0
  62. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  63. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  64. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  65. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  66. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  67. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  68. xinference/web/ui/node_modules/delegate/package.json +31 -0
  69. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  70. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  71. xinference/web/ui/node_modules/select/bower.json +13 -0
  72. xinference/web/ui/node_modules/select/package.json +29 -0
  73. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  74. xinference/web/ui/package-lock.json +34 -0
  75. xinference/web/ui/package.json +1 -0
  76. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
  77. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
  78. xinference/client/oscar/__init__.py +0 -13
  79. xinference/client/oscar/actor_client.py +0 -611
  80. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  81. xinference/model/llm/pytorch/spec_model.py +0 -186
  82. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  83. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  89. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
  90. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
  91. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
  92. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json
@@ -461,6 +461,106 @@
             }
         ]
     },
+    {
+        "version": 1,
+        "context_length": 128000,
+        "model_name": "phi-3-mini-128k-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "microsoft/Phi-3-mini-128k-instruct",
+                "model_revision": "ebee18c488086b396dde649f2aa6548b9b8d2404"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "PHI3",
+            "system_prompt": "You are a helpful AI assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "inter_message_sep": "<|end|>\n",
+            "stop_token_ids": [
+                32000,
+                32001,
+                32007
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|assistant|>",
+                "<|end|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "phi-3-mini-4k-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+        "model_specs": [
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "fp16",
+                    "q4"
+                ],
+                "model_id": "microsoft/Phi-3-mini-4k-instruct-gguf",
+                "model_file_name_template": "Phi-3-mini-4k-instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "microsoft/Phi-3-mini-4k-instruct",
+                "model_revision": "b86bcaf57ea4dfdec5dbe12a377028b2fab0d480"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "PHI3",
+            "system_prompt": "You are a helpful AI assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n",
+            "inter_message_sep": "<|end|>\n",
+            "stop_token_ids": [
+                32000,
+                32001,
+                32007
+            ],
+            "stop": [
+                "<|endoftext|>",
+                "<|assistant|>",
+                "<|end|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
@@ -624,7 +724,7 @@
                     "none"
                 ],
                 "model_id": "THUDM/chatglm3-6b",
-                "model_revision": "b098244a71fbe69ce149682d9072a7629f7e908c"
+                "model_revision": "103caa40027ebfd8450289ca2f278eac4ff26405"
             }
         ],
         "prompt_style": {
@@ -1220,6 +1320,148 @@
             }
         ]
     },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-8B"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "Q2_K",
+                    "Q3_K_L",
+                    "Q3_K_M",
+                    "Q3_K_S",
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "QuantFactory/Meta-Llama-3-8B-GGUF",
+                "model_file_name_template": "Meta-Llama-3-8B.{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-70B"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "Q4_K_M",
+                    "Q5_K_M"
+                ],
+                "model_id": "NousResearch/Meta-Llama-3-70B-GGUF",
+                "model_file_name_template": "Meta-Llama-3-70B-{quantization}.gguf"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+        "model_specs": [
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "IQ3_M",
+                    "Q4_K_M",
+                    "Q5_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
+                "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-8B-Instruct"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "IQ1_M",
+                    "IQ2_XS",
+                    "Q4_K_M"
+                ],
+                "model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
+                "model_file_name_template": "Meta-Llama-3-70B-Instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-70B-Instruct"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "LLAMA3",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n\n",
+            "inter_message_sep": "<|eot_id|>",
+            "stop_token_ids": [
+                128001,
+                128009
+            ],
+            "stop": [
+                "<|end_of_text|>",
+                "<|eot_id|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
@@ -1625,6 +1867,16 @@
                 ],
                 "model_id": "Qwen/Qwen1.5-72B-Chat"
             },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/Qwen1.5-110B-Chat"
+            },
             {
                 "model_format": "gptq",
                 "model_size_in_billions": "0_5",
@@ -1687,6 +1939,14 @@
                 ],
                 "model_id": "Qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}"
             },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "Qwen/Qwen1.5-110B-Chat-GPTQ-Int4"
+            },
             {
                 "model_format": "awq",
                 "model_size_in_billions": "0_5",
@@ -1743,6 +2003,14 @@
                 ],
                 "model_id": "Qwen/Qwen1.5-72B-Chat-AWQ"
             },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 110,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "Qwen/Qwen1.5-110B-Chat-AWQ"
+            },
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": "0_5",
@@ -3177,6 +3445,142 @@
             "inter_message_sep": ""
         }
     },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "mixtral-8x22B-instruct-v0.1",
+        "model_lang": [
+            "en",
+            "fr",
+            "it",
+            "de",
+            "es"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Mixtral-8x22B-Instruct-v0.1 Large Language Model (LLM) is an instruct fine-tuned version of the Mixtral-8x22B-v0.1, specializing in chatting.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "141",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1",
+                "model_revision": "ebb919ac9e9f7f9a900644621bae7963bc593f4f"
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": "141",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": "141",
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_id": "jarrelscy/Mixtral-8x22B-Instruct-v0.1-GPTQ-4bit"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": "141",
+                "quantizations": [
+                    "Q2_K",
+                    "Q3_K_L",
+                    "Q3_K_M",
+                    "Q3_K_S",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6",
+                    "Q8_0",
+                    "fp16"
+                ],
+                "model_id": "MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-GGUF",
+                "model_file_name_template": "Mixtral-8x22B-Instruct-{quantization}.gguf",
+                "model_file_name_split_template": "Mixtral-8x22B-Instruct-v0.1.{quantization}-{part}.gguf",
+                "quantization_parts": {
+                    "Q2_K": [
+                        "00001-of-00003",
+                        "00002-of-00003",
+                        "00003-of-00003"
+                    ],
+                    "Q3_K_L": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "Q3_K_M": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "Q3_K_S": [
+                        "00001-of-00003",
+                        "00002-of-00003",
+                        "00003-of-00003"
+                    ],
+                    "Q4_K_M": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "Q4_K_S": [
+                        "00001-of-00002",
+                        "00002-of-00002"
+                    ],
+                    "Q5_K_M": [
+                        "00001-of-00004",
+                        "00002-of-00004",
+                        "00003-of-00004",
+                        "00004-of-00004"
+                    ],
+                    "Q5_K_S": [
+                        "00001-of-00004",
+                        "00002-of-00004",
+                        "00003-of-00004",
+                        "00004-of-00004"
+                    ],
+                    "Q6": [
+                        "00001-of-00004",
+                        "00002-of-00004",
+                        "00003-of-00004",
+                        "00004-of-00004"
+                    ],
+                    "Q8_0": [
+                        "00001-of-00004",
+                        "00002-of-00004",
+                        "00003-of-00004",
+                        "00004-of-00004"
+                    ],
+                    "fp16": [
+                        "00001-of-00007",
+                        "00002-of-00007",
+                        "00003-of-00007",
+                        "00004-of-00007",
+                        "00005-of-00007",
+                        "00006-of-00007",
+                        "00007-of-00007"
+                    ]
+                }
+            }
+        ],
+        "prompt_style": {
+            "style_name": "MIXTRAL_V01",
+            "system_prompt": "",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": ""
+        }
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -4953,7 +5357,7 @@
                     "Q8_0"
                 ],
                 "model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
-                "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf"
+                "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf"
             },
             {
                 "model_format": "pytorch",
@@ -5015,5 +5419,45 @@
                 "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
             }
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "Starling-LM",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "We introduce Starling-7B, an open large language model (LLM) trained by Reinforcement Learning from AI Feedback (RLAIF). The model harnesses the power of our new GPT-4 labeled ranking dataset",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "berkeley-nest/Starling-LM-7B-alpha",
+                "model_revision": "1dddf3b95bc1391f6307299eb1c162c194bde9bd"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "ADD_COLON_SINGLE",
+            "system_prompt": "",
+            "roles": [
+                "GPT4 Correct User",
+                "GPT4 Correct Assistant"
+            ],
+            "intra_message_sep": "<|end_of_turn|>",
+            "inter_message_sep": "",
+            "stop_token_ids": [
+                2,
+                32000
+            ]
+        }
     }
 ]
xinference/model/llm/llm_family.py
@@ -33,7 +33,6 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
-from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -167,7 +166,7 @@ class CustomLLMFamilyV1(LLMFamilyV1):
             )
         if (
             llm_spec.model_family != "other"
-            and "tool_call" in llm_spec.model_ability
+            and "tools" in llm_spec.model_ability
             and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
         ):
             raise ValueError(
@@ -227,16 +226,23 @@ LLMFamilyV1.update_forward_refs()
 CustomLLMFamilyV1.update_forward_refs()


-LLM_CLASSES: List[Type[LLM]] = []
-PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
+LLAMA_CLASSES: List[Type[LLM]] = []

 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []

+SGLANG_CLASSES: List[Type[LLM]] = []
+TRANSFORMERS_CLASSES: List[Type[LLM]] = []
+
 UD_LLM_FAMILIES: List["LLMFamilyV1"] = []

 UD_LLM_FAMILIES_LOCK = Lock()

+VLLM_CLASSES: List[Type[LLM]] = []
+
+LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
+SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
+
 LLM_LAUNCH_VERSIONS: Dict[str, List[str]] = {}

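Note: judging from how check_engine_by_spec_parameters consumes it (final hunk below), the new LLM_ENGINES registry maps model name → engine name → runnable parameter combinations. An illustrative, hypothetical entry:

    from typing import Any, Dict, List

    # Shape sketch only; real entries are built by
    # generate_engine_config_by_model_family() at registration time.
    LLM_ENGINES_EXAMPLE: Dict[str, Dict[str, List[Dict[str, Any]]]] = {
        "llama-3-instruct": {
            "Transformers": [
                {
                    "model_name": "llama-3-instruct",
                    "model_format": "pytorch",
                    "model_size_in_billions": 8,
                    "quantizations": ["4-bit", "8-bit", "none"],
                    "llm_class": None,  # a Type[LLM] in the real registry
                }
            ]
        }
    }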
@@ -822,7 +828,6 @@ def match_llm(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
-    is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
     """
     Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -880,30 +885,15 @@
                     matched_quantization,
                 )
             else:
-                if spec.model_format == "pytorch":
-                    return family, _apply_format_to_model_id(spec, "none"), "none"
-                else:
-                    # by default, choose the most coarse-grained quantization.
-                    # TODO: too hacky.
-                    quantizations = spec.quantizations
-                    quantizations.sort()
-                    for q in quantizations:
-                        if (
-                            is_local_deployment
-                            and not (_is_linux() and _has_cuda_device())
-                            and q == "4-bit"
-                        ):
-                            logger.warning(
-                                "Skipping %s for non-linux or non-cuda local deployment .",
-                                q,
-                            )
-                            continue
-                        return family, _apply_format_to_model_id(spec, q), q
+                # TODO: If user does not specify quantization, just use the first one
+                _q = "none" if spec.model_format == "pytorch" else spec.quantizations[0]
+                return family, _apply_format_to_model_id(spec, _q), _q
     return None


 def register_llm(llm_family: LLMFamilyV1, persist: bool):
     from ..utils import is_valid_model_name
+    from . import generate_engine_config_by_model_family

     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
@@ -916,6 +906,7 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
         )

     UD_LLM_FAMILIES.append(llm_family)
+    generate_engine_config_by_model_family(llm_family)

     if persist:
         # We only validate model URL when persist is True.
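Note: because registration now immediately calls generate_engine_config_by_model_family, a custom model becomes engine-addressable as soon as it is registered. A minimal sketch via the RESTful client (the model definition is hypothetical; the JSON follows the CustomLLMFamilyV1 schema, and the endpoint URL is an assumption):

    import json
    from xinference.client import Client

    custom_family = {  # hypothetical custom model
        "version": 1,
        "context_length": 4096,
        "model_name": "my-custom-llm",
        "model_lang": ["en"],
        "model_ability": ["generate"],
        "model_specs": [{
            "model_format": "pytorch",
            "model_size_in_billions": 7,
            "quantizations": ["none"],
            "model_uri": "file:///path/to/weights",
        }],
    }

    client = Client("http://localhost:9997")
    client.register_model(model_type="LLM", model=json.dumps(custom_family), persist=False)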
@@ -941,6 +932,7 @@ def unregister_llm(model_name: str, raise_error: bool = True):
             break
     if llm_family:
         UD_LLM_FAMILIES.remove(llm_family)
+        del LLM_ENGINES[model_name]

         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
@@ -972,21 +964,33 @@
         logger.warning(f"Custom model {model_name} not found")


-def match_llm_cls(
-    family: LLMFamilyV1,
-    llm_spec: "LLMSpecV1",
+def check_engine_by_spec_parameters(
+    model_engine: str,
+    model_name: str,
+    model_format: str,
+    model_size_in_billions: Union[str, int],
     quantization: str,
-    peft_model: Optional[List[LoRA]] = None,
-) -> Optional[Type[LLM]]:
-    """
-    Find an LLM implementation for given LLM family and spec.
-    """
-    if peft_model is not None:
-        for cls in PEFT_SUPPORTED_CLASSES:
-            if cls.match(family, llm_spec, quantization):
-                return cls
-    else:
-        for cls in LLM_CLASSES:
-            if cls.match(family, llm_spec, quantization):
-                return cls
-    return None
+) -> Type[LLM]:
+    def get_model_engine_from_spell(engine_str: str) -> str:
+        for engine in LLM_ENGINES[model_name].keys():
+            if engine.lower() == engine_str.lower():
+                return engine
+        return engine_str
+
+    if model_name not in LLM_ENGINES:
+        raise ValueError(f"Model {model_name} not found.")
+    model_engine = get_model_engine_from_spell(model_engine)
+    if model_engine not in LLM_ENGINES[model_name]:
+        raise ValueError(f"Model {model_name} cannot be run on engine {model_engine}.")
+    match_params = LLM_ENGINES[model_name][model_engine]
+    for param in match_params:
+        if (
+            model_name == param["model_name"]
+            and model_format == param["model_format"]
+            and model_size_in_billions == param["model_size_in_billions"]
+            and quantization in param["quantizations"]
+        ):
+            return param["llm_class"]
+    raise ValueError(
+        f"Model {model_name} cannot be run on engine {model_engine}, with format {model_format}, size {model_size_in_billions} and quantization {quantization}."
+    )
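Note: the replacement resolves an implementation from explicit launch parameters instead of probing each class's match(), with engine spelling matched case-insensitively via get_model_engine_from_spell and mismatches failing fast with a descriptive ValueError. A usage sketch (the parameter values are illustrative):

    from xinference.model.llm.llm_family import check_engine_by_spec_parameters

    # "vllm" resolves to the registered engine name regardless of case;
    # raises ValueError if the combination is not runnable on that engine.
    llm_cls = check_engine_by_spec_parameters(
        model_engine="vllm",
        model_name="llama-3-instruct",
        model_format="pytorch",
        model_size_in_billions=8,
        quantization="none",
    )
    print(llm_cls)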