xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.

Files changed (97)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +34 -15
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +40 -18
  5. xinference/core/supervisor.py +48 -9
  6. xinference/core/worker.py +13 -8
  7. xinference/deploy/cmdline.py +22 -9
  8. xinference/model/audio/__init__.py +40 -1
  9. xinference/model/audio/core.py +25 -45
  10. xinference/model/audio/custom.py +148 -0
  11. xinference/model/core.py +6 -9
  12. xinference/model/embedding/core.py +1 -2
  13. xinference/model/embedding/model_spec.json +24 -0
  14. xinference/model/embedding/model_spec_modelscope.json +24 -0
  15. xinference/model/image/core.py +12 -4
  16. xinference/model/image/stable_diffusion/core.py +8 -7
  17. xinference/model/llm/__init__.py +0 -6
  18. xinference/model/llm/core.py +9 -14
  19. xinference/model/llm/ggml/llamacpp.py +2 -10
  20. xinference/model/llm/llm_family.json +507 -7
  21. xinference/model/llm/llm_family.py +41 -4
  22. xinference/model/llm/llm_family_modelscope.json +260 -0
  23. xinference/model/llm/pytorch/baichuan.py +4 -3
  24. xinference/model/llm/pytorch/chatglm.py +5 -2
  25. xinference/model/llm/pytorch/core.py +37 -41
  26. xinference/model/llm/pytorch/falcon.py +6 -5
  27. xinference/model/llm/pytorch/internlm2.py +5 -2
  28. xinference/model/llm/pytorch/llama_2.py +6 -5
  29. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  30. xinference/model/llm/pytorch/vicuna.py +4 -3
  31. xinference/model/llm/pytorch/yi_vl.py +4 -2
  32. xinference/model/llm/utils.py +42 -4
  33. xinference/model/llm/vllm/core.py +54 -6
  34. xinference/model/rerank/core.py +26 -12
  35. xinference/model/rerank/model_spec.json +24 -0
  36. xinference/model/rerank/model_spec_modelscope.json +25 -1
  37. xinference/model/utils.py +12 -1
  38. xinference/thirdparty/omnilmm/chat.py +1 -1
  39. xinference/types.py +70 -19
  40. xinference/utils.py +1 -0
  41. xinference/web/ui/build/asset-manifest.json +3 -3
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  44. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  65. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
  66. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
  67. xinference/model/llm/ggml/ctransformers.py +0 -281
  68. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  69. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  70. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  94. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
  95. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
  96. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
  97. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.json
@@ -913,6 +913,38 @@
         "model_id": "meta-llama/Llama-2-7b-chat-hf",
         "model_revision": "08751db2aca9bf2f7f80d2e516117a53d7450235"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-GPTQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-Chat-AWQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-Chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 13,
@@ -924,6 +956,22 @@
         "model_id": "meta-llama/Llama-2-13b-chat-hf",
         "model_revision": "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-chat-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1045,6 +1093,22 @@
         "model_id": "TheBloke/Llama-2-7B-GGML",
         "model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-7B-AWQ"
+      },
       {
         "model_format": "ggmlv3",
         "model_size_in_billions": 13,
@@ -1111,6 +1175,22 @@
         "model_id": "meta-llama/Llama-2-13b-hf",
         "model_revision": "db6b8eb1feabb38985fdf785a89895959e944936"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 13,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-13B-AWQ"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 70,
@@ -1121,6 +1201,22 @@
         ],
         "model_id": "meta-llama/Llama-2-70b-hf",
         "model_revision": "cc8aa03a000ff08b4d5c5b39673321a2a396c396"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Llama-2-70B-AWQ"
       }
     ]
   },
@@ -1509,6 +1605,16 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 72,
@@ -1564,6 +1670,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 72,
@@ -1613,6 +1727,14 @@
         ],
         "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-AWQ"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": 72,
@@ -1701,6 +1823,22 @@
         "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF",
         "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Qwen/Qwen1.5-32B-Chat-GGUF",
+        "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 72,
@@ -1740,6 +1878,126 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "codeqwen1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat-GGUF",
+        "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -1780,13 +2038,13 @@
     "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
     "model_specs": [
       {
-        "model_format": "ggmlv3",
-        "model_size_in_billions": 1,
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_5",
         "quantizations": [
           "none"
         ],
-        "model_id": "marella/gpt-2-ggml",
-        "model_file_name_template": "ggml-model.bin"
+        "model_id": "openai-community/gpt2",
+        "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
       }
     ]
   },
@@ -2569,6 +2827,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
         "model_revision": "54766df6d50e4d3d7ccd66758e5341ba105a6d36"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2630,6 +2904,22 @@
         "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
         "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 7,
@@ -2790,6 +3080,14 @@
         "model_id": "mistralai/Mixtral-8x7B-v0.1",
         "model_revision": "58301445dc1378584211722b7ebf8743ec4e192b"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "46_7",
@@ -2839,10 +3137,17 @@
         "model_format": "awq",
         "model_size_in_billions": "46_7",
         "quantizations": [
-          "4-bit"
+          "Int4"
         ],
-        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
-        "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "46_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
       },
       {
         "model_format": "ggufv2",
@@ -4515,5 +4820,200 @@
         "</s>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "seallm_v2",
+    "model_lang": [
+      "en",
+      "zh",
+      "vi",
+      "id",
+      "th",
+      "ms",
+      "km",
+      "lo",
+      "my",
+      "tl"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "We introduce SeaLLM-7B-v2, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2",
+        "model_revision": "f1bd48e0d75365c24a3c5ad006b2d0a0c9dca30f"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q4_0",
+          "Q8_0"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2-gguf",
+        "model_file_name_template": "SeaLLM-7B-v2.{quantization}.gguf"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "seallm_v2.5",
+    "model_lang": [
+      "en",
+      "zh",
+      "vi",
+      "id",
+      "th",
+      "ms",
+      "km",
+      "lo",
+      "my",
+      "tl"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "We introduce SeaLLM-7B-v2.5, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2.5",
+        "model_revision": "c54a8eb8e2d58c5a680bfbbe3a7ae71753bb644b"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q4_K_M",
+          "Q8_0"
+        ],
+        "model_id": "SeaLLMs/SeaLLM-7B-v2.5-GGUF",
+        "model_file_name_template": "SeaLLM-7B-v2.5.{quantization}.gguf"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-v01",
+        "model_revision": "16881ccde1c68bbc7041280e6a66637bc46bfe88"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_L",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
+        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-plus",
+        "model_revision": "ba7f1d954c9d1609013677d87e4142ab95c34e62"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "alpindale/c4ai-command-r-plus-GPTQ",
+        "model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01-4bit",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-v01-4bit",
+        "model_revision": "f2e87936a146643c9dd143422dcafb9cb1552611"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "CohereForAI/c4ai-command-r-plus-4bit",
+        "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
+      }
+    ]
   }
 ]
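
For orientation, a minimal sketch (not part of the diff) of how one of the newly added quantized specs can be exercised through the RESTful client. It assumes a local supervisor is already running on the default endpoint, e.g. started with "xinference-local"; the prompt is arbitrary.

    # Launch the new 7B GPTQ spec of the built-in "llama-2-chat" family,
    # then run a single chat turn against it.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_format="gptq",
        model_size_in_billions=7,
        quantization="Int4",
    )
    model = client.get_model(model_uid)
    print(model.chat("What is the capital of France?"))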
xinference/model/llm/llm_family.py
@@ -33,6 +33,7 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -199,6 +200,21 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 )
             llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]

+        # check model ability, registering LLM only provides generate and chat
+        # but for vision models, we add back the abilities so that
+        # gradio chat interface can be generated properly
+        if (
+            llm_spec.model_family != "other"
+            and llm_spec.model_family
+            in {
+                family.model_name
+                for family in BUILTIN_LLM_FAMILIES
+                if "vision" in family.model_ability
+            }
+            and "vision" not in llm_spec.model_ability
+        ):
+            llm_spec.model_ability.append("vision")
+
         return llm_spec

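
The comment in the hunk above states the intent: custom registrations declare only "generate" or "chat", so "vision" is restored whenever the declared model_family is a built-in vision family. Below is a self-contained sketch of that check; Family and the stubbed BUILTIN_LLM_FAMILIES are illustrative stand-ins, not xinference's actual classes.

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Family:
        model_name: str
        model_ability: List[str] = field(default_factory=list)

    # Stand-in registry: "qwen-vl-chat" is a built-in vision family,
    # "llama-2-chat" is chat-only.
    BUILTIN_LLM_FAMILIES = [
        Family("qwen-vl-chat", ["chat", "vision"]),
        Family("llama-2-chat", ["chat"]),
    ]

    def restore_vision_ability(model_family: str, model_ability: List[str]) -> List[str]:
        # Mirror the diff: collect the built-in families that support vision...
        vision_families = {
            f.model_name for f in BUILTIN_LLM_FAMILIES if "vision" in f.model_ability
        }
        # ...and re-add "vision" when the custom model belongs to one of them.
        if (
            model_family != "other"
            and model_family in vision_families
            and "vision" not in model_ability
        ):
            model_ability.append("vision")
        return model_ability

    print(restore_vision_ability("qwen-vl-chat", ["chat"]))  # ['chat', 'vision']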
@@ -782,10 +798,29 @@ def get_user_defined_llm_families():
     return UD_LLM_FAMILIES.copy()


+def match_model_size(
+    model_size: Union[int, str], spec_model_size: Union[int, str]
+) -> bool:
+    if isinstance(model_size, str):
+        model_size = model_size.replace("_", ".")
+    if isinstance(spec_model_size, str):
+        spec_model_size = spec_model_size.replace("_", ".")
+
+    if model_size == spec_model_size:
+        return True
+
+    try:
+        ms = int(model_size)
+        ss = int(spec_model_size)
+        return ms == ss
+    except ValueError:
+        return False
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
@@ -829,7 +864,9 @@ def match_llm(
             model_format
             and model_format != spec.model_format
             or model_size_in_billions
-            and model_size_in_billions != spec.model_size_in_billions
+            and not match_model_size(
+                model_size_in_billions, spec.model_size_in_billions
+            )
             or quantization
             and matched_quantization is None
         ):
@@ -939,12 +976,12 @@ def match_llm_cls(
     family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
     quantization: str,
-    peft_model_path: Optional[str] = None,
+    peft_model: Optional[List[LoRA]] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    if peft_model_path is not None:
+    if peft_model is not None:
         for cls in PEFT_SUPPORTED_CLASSES:
             if cls.match(family, llm_spec, quantization):
                 return cls
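
To make the size-matching change concrete, a short usage sketch (assuming match_model_size is imported from xinference.model.llm.llm_family, where the hunk above defines it): spec sizes stored as strings with underscores standing in for decimal points, such as "2_7" for the Qwen1.5-MoE specs, now match the dotted user-supplied form, and plain ints still match their string counterparts. This is also why match_llm's model_size_in_billions parameter widened from Optional[int] to Optional[Union[int, str]].

    from xinference.model.llm.llm_family import match_model_size

    print(match_model_size("2_7", "2_7"))  # True: exact string match
    print(match_model_size("2.7", "2_7"))  # True: "_" is normalized to "."
    print(match_model_size(7, "7"))        # True: equal after int coercion
    print(match_model_size(7, "2_7"))      # False: "2.7" cannot be coerced to int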