PyPI - xinference - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (97) hide show

xinference/model/llm/llm_family_modelscope.json CHANGED Viewed

@@ -1825,6 +1825,17 @@
         "model_id": "qwen/Qwen1.5-14B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 72,
@@ -1886,6 +1897,15 @@
         "model_id": "qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": 72,
@@ -1941,6 +1961,15 @@
         "model_id": "qwen/Qwen1.5-14B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": 72,
@@ -2035,6 +2064,23 @@
         "model_hub": "modelscope",
         "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
       },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 32,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "qwen/Qwen1.5-32B-Chat-GGUF",
+        "model_hub": "modelscope",
+        "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": 72,
@@ -2075,6 +2121,131 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "qwen1.5-moe-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": "2_7",
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "codeqwen1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
+        "model_hub": "modelscope",
+        "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
@@ -2945,5 +3116,94 @@
         "</s>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-v01",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "Q2_K",
+          "Q4_K_M",
+          "Q5_K_M"
+        ],
+        "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
+        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-plus",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01-4bit",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "mirror013/c4ai-command-r-v01-4bit",
+        "model_revision": "master"
+      }
+    ]
   }
 ]

xinference/model/llm/pytorch/baichuan.py CHANGED Viewed

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
+from typing import List, Optional
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False

xinference/model/llm/pytorch/chatglm.py CHANGED Viewed

@@ -24,6 +24,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
     def _load_model(self, **kwargs):
@@ -135,6 +136,8 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             chat_history = [h for h in chat_history if not h.get("tool_calls")]
         if not chat_history:
             chat_history = []
+        if system_prompt:
+            chat_history.append({"role": "system", "content": system_prompt})
         if tools:
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs

xinference/model/llm/pytorch/core.py CHANGED Viewed

@@ -32,6 +32,7 @@ from ....types import (
     Embedding,
     EmbeddingData,
     EmbeddingUsage,
+    LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
 )
@@ -42,6 +43,25 @@ from ..utils import ChatModelMixin
 logger = logging.getLogger(__name__)
+NON_DEFAULT_MODEL_LIST: List[str] = [
+    "baichuan-chat",
+    "baichuan-2-chat",
+    "vicuna-v1.3",
+    "falcon",
+    "falcon-instruct",
+    "chatglm",
+    "chatglm2",
+    "chatglm2-32k",
+    "chatglm2-128k",
+    "llama-2",
+    "llama-2-chat",
+    "internlm2-chat",
+    "qwen-vl-chat",
+    "OmniLMM",
+    "yi-vl-chat",
+    "deepseek-vl-chat",
+]
 class PytorchModel(LLM):
     def __init__(
@@ -52,14 +72,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
-        self._peft_model_path = peft_model_path
+        self._peft_model = peft_model
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -115,7 +135,7 @@ class PytorchModel(LLM):
         return model, tokenizer
     def _apply_lora(self):
-        if self._peft_model_path is not None:
+        if self._peft_model is not None:
             try:
                 from peft import PeftModel
             except ImportError:
@@ -123,14 +143,15 @@ class PytorchModel(LLM):
                     f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
                 )
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                self._peft_model_path,
-            )
-            logger.info(
-                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
-            )
+            for peft_model in self._peft_model:
+                # Apply LoRA
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                )
+                logger.info(
+                    f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
+                )
     def load(self):
         try:
@@ -233,17 +254,7 @@ class PytorchModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
-        if model_family in [
-            "baichuan-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-        ]:
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -412,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -421,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
-            peft_model_path,
+            peft_model,
         )
     def _sanitize_generate_config(
@@ -452,23 +463,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
-        if llm_family.model_name in [
-            "baichuan-chat",
-            "baichuan-2-chat",
-            "vicuna-v1.3",
-            "falcon",
-            "falcon-instruct",
-            "chatglm",
-            "chatglm2",
-            "chatglm2-32k",
-            "llama-2",
-            "llama-2-chat",
-            "internlm2-chat",
-            "qwen-vl-chat",
-            "OmniLMM",
-            "yi-vl-chat",
-            "deepseek-vl-chat",
-        ]:
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in NON_DEFAULT_MODEL_LIST:
             return False
         if "chat" not in llm_family.model_ability:
             return False

xinference/model/llm/pytorch/falcon.py CHANGED Viewed

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
+from typing import List, Optional
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
     def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
     def _load_model(self, **kwargs):

xinference/model/llm/pytorch/internlm2.py CHANGED Viewed

@@ -23,6 +23,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
     def _load_model(self, **kwargs):
@@ -114,6 +115,8 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             ]
         else:
             input_history = []
+        if system_prompt:
+            kwargs["meta_instruction"] = system_prompt
         if stream:
             def _stream_generator():

xinference/model/llm/pytorch/llama_2.py CHANGED Viewed

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
+from typing import List, Optional
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig
@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
     def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False

xinference/model/llm/pytorch/qwen_vl.py CHANGED Viewed

@@ -53,6 +53,8 @@ class QwenVLChatModel(PytorchChatModel):
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
         self._tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,

xinference/model/llm/pytorch/vicuna.py CHANGED Viewed

@@ -26,8 +26,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
+from typing import List, Optional
+from ....types import LoRA
 from .. import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig
@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False

xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

Potentially problematic release.

xinference 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl