xinference 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +9 -9
- xinference/client/restful/restful_client.py +32 -16
- xinference/core/supervisor.py +32 -9
- xinference/core/worker.py +13 -8
- xinference/deploy/cmdline.py +22 -9
- xinference/model/audio/__init__.py +40 -1
- xinference/model/audio/core.py +25 -45
- xinference/model/audio/custom.py +148 -0
- xinference/model/core.py +6 -9
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/image/core.py +12 -4
- xinference/model/image/stable_diffusion/core.py +8 -7
- xinference/model/llm/core.py +9 -14
- xinference/model/llm/llm_family.json +263 -0
- xinference/model/llm/llm_family.py +26 -4
- xinference/model/llm/llm_family_modelscope.json +160 -0
- xinference/model/llm/pytorch/baichuan.py +4 -3
- xinference/model/llm/pytorch/chatglm.py +3 -2
- xinference/model/llm/pytorch/core.py +15 -13
- xinference/model/llm/pytorch/falcon.py +6 -5
- xinference/model/llm/pytorch/internlm2.py +3 -2
- xinference/model/llm/pytorch/llama_2.py +6 -5
- xinference/model/llm/pytorch/vicuna.py +4 -3
- xinference/model/llm/vllm/core.py +3 -0
- xinference/model/rerank/core.py +23 -12
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +25 -1
- xinference/model/utils.py +12 -1
- xinference/types.py +55 -0
- xinference/utils.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/METADATA +4 -1
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/RECORD +49 -46
- xinference/web/ui/build/static/js/main.76ef2b17.js +0 -3
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +0 -1
- /xinference/web/ui/build/static/js/{main.76ef2b17.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
CHANGED

@@ -33,6 +33,7 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -797,10 +798,29 @@ def get_user_defined_llm_families():
     return UD_LLM_FAMILIES.copy()


+def match_model_size(
+    model_size: Union[int, str], spec_model_size: Union[int, str]
+) -> bool:
+    if isinstance(model_size, str):
+        model_size = model_size.replace("_", ".")
+    if isinstance(spec_model_size, str):
+        spec_model_size = spec_model_size.replace("_", ".")
+
+    if model_size == spec_model_size:
+        return True
+
+    try:
+        ms = int(model_size)
+        ss = int(spec_model_size)
+        return ms == ss
+    except ValueError:
+        return False
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
@@ -844,7 +864,9 @@ def match_llm(
         model_format
         and model_format != spec.model_format
         or model_size_in_billions
-        and model_size_in_billions != spec.model_size_in_billions
+        and not match_model_size(
+            model_size_in_billions, spec.model_size_in_billions
+        )
         or quantization
         and matched_quantization is None
     ):
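The new matcher lets model sizes be written either as integers or as strings that use an underscore as the decimal separator (the spec files encode a 1.8B model as "1_8", since "." cannot appear there). A quick illustrative sketch of its behavior; these calls are examples, not part of the diff:

# Illustrative calls against match_model_size() as defined above.
assert match_model_size("1_8", "1_8")   # underscores normalize to "1.8" on both sides
assert match_model_size(7, "7")         # int and numeric string agree via int()
assert not match_model_size("1_8", 2)   # int("1.8") raises ValueError, so no match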
@@ -954,12 +976,12 @@ def match_llm_cls(
     family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
     quantization: str,
-    peft_model_path: Optional[str] = None,
+    peft_model: Optional[List[LoRA]] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    if peft_model_path is not None:
+    if peft_model is not None:
         for cls in PEFT_SUPPORTED_CLASSES:
             if cls.match(family, llm_spec, quantization):
                 return cls
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -2175,6 +2175,77 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "codeqwen1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
+        "model_hub": "modelscope",
+        "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
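As a usage sketch (not from the diff): with 0.10.2 installed and a local supervisor running, a spec like the one above can be exercised through the documented Python client. The endpoint and quantization here are placeholder choices:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(
    model_name="codeqwen1.5-chat",
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="q4_k_m",
)
model = client.get_model(uid)
print(model.chat("Write a binary search function in Python."))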
@@ -3045,5 +3116,94 @@
         "</s>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-v01",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "Q2_K",
+          "Q4_K_M",
+          "Q5_K_M"
+        ],
+        "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
+        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-plus",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01-4bit",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "mirror013/c4ai-command-r-v01-4bit",
+        "model_revision": "master"
+      }
+    ]
   }
 ]
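Both Command-R entries declare only the "generate" ability, so they are driven through completions rather than chat. A hedged sketch against the documented client API, again with a placeholder endpoint:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(
    model_name="c4ai-command-r-v01",
    model_format="ggufv2",
    model_size_in_billions=35,
    quantization="Q4_K_M",
)
model = client.get_model(uid)
print(model.generate("Command-R in one sentence:", generate_config={"max_tokens": 64}))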
xinference/model/llm/pytorch/baichuan.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/chatglm.py
CHANGED

@@ -24,6 +24,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py
CHANGED

@@ -32,6 +32,7 @@ from ....types import (
     Embedding,
     EmbeddingData,
     EmbeddingUsage,
+    LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
 )
@@ -71,14 +72,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
-        self._peft_model_path = peft_model_path
+        self._peft_model = peft_model

     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -134,7 +135,7 @@ class PytorchModel(LLM):
         return model, tokenizer

     def _apply_lora(self):
-        if self._peft_model_path is not None:
+        if self._peft_model is not None:
             try:
                 from peft import PeftModel
             except ImportError:
@@ -142,14 +143,15 @@ class PytorchModel(LLM):
                     f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
                 )

-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                self._peft_model_path,
-            )
-            logger.info(f"PEFT adaptor loaded for model '{self.model_uid}'.")
-
-
+            for peft_model in self._peft_model:
+                # Apply LoRA
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                )
+                logger.info(
+                    f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
+                )

     def load(self):
         try:
@@ -421,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -430,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
-            peft_model_path,
+            peft_model,
         )

     def _sanitize_generate_config(
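The loop in _apply_lora now folds every adapter in the list into the model, one after another. A standalone sketch of the same pattern using peft directly; the base checkpoint and adapter paths are placeholders:

from dataclasses import dataclass

from peft import PeftModel
from transformers import AutoModelForCausalLM


@dataclass
class LoRA:  # stand-in for xinference.types.LoRA (lora_name, local_path)
    lora_name: str
    local_path: str


adapters = [LoRA("sql", "/adapters/sql"), LoRA("chat", "/adapters/chat")]
model = AutoModelForCausalLM.from_pretrained("org/base-model")  # placeholder id
for adapter in adapters:
    # Each call wraps the current model, so adapters apply in sequence.
    model = PeftModel.from_pretrained(model, adapter.local_path)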
xinference/model/llm/pytorch/falcon.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py
CHANGED

@@ -23,6 +23,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/vicuna.py
CHANGED

@@ -26,8 +26,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from .. import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig

@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/vllm/core.py
CHANGED

@@ -116,6 +116,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -126,6 +127,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":

 if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")


 class VLLMModel(LLM):
xinference/model/rerank/core.py
CHANGED
@@ -42,8 +42,9 @@ def get_rerank_model_descriptions():
 class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
+    type: Optional[str] = "normal"
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
     model_hub: str = "huggingface"

@@ -63,6 +64,7 @@ class RerankModelDescription(ModelDescription):
         "model_type": "rerank",
         "address": self.address,
         "accelerators": self.devices,
+        "type": self._model_spec.type,
         "model_name": self._model_spec.model_name,
         "language": self._model_spec.language,
         "model_revision": self._model_spec.model_revision,
@@ -97,12 +99,14 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[Dict]]:
 class RerankModel:
     def __init__(
         self,
+        model_spec: RerankModelSpec,
         model_uid: str,
         model_path: str,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
+        self._model_spec = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -112,20 +116,25 @@ class RerankModel:

     def load(self):
         try:
-            from sentence_transformers.cross_encoder import CrossEncoder
+            if self._model_spec.type == "normal":
+                from FlagEmbedding import FlagReranker
+            elif self._model_spec.type == "LLM-based":
+                from FlagEmbedding import FlagLLMReranker as FlagReranker
+            elif self._model_spec.type == "LLM-based layerwise":
+                from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+            else:
+                raise RuntimeError(
+                    f"Unsupported Rank model type: {self._model_spec.type}"
+                )
         except ImportError:
-            error_message = "Failed to import module 'sentence-transformers'"
+            error_message = "Failed to import module 'FlagEmbedding'"
             installation_guide = [
-                "Please make sure 'sentence-transformers' is installed. ",
-                "You can install it by `pip install sentence-transformers`\n",
+                "Please make sure 'FlagEmbedding' is installed. ",
+                "You can install it by `pip install FlagEmbedding`\n",
             ]

             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        self._model = CrossEncoder(
-            self._model_path, device=self._device, **self._model_config
-        )
-        if self._use_fp16:
-            self._model.model.half()
+        self._model = FlagReranker(self._model_path, use_fp16=True)

     def rerank(
         self,
@@ -142,7 +151,7 @@ class RerankModel:
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.predict(sentence_combinations)
+        similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
@@ -224,7 +233,9 @@ def create_rerank_model_instance(

     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
-    model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+    model = RerankModel(
+        model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
+    )
     model_description = RerankModelDescription(
         subpool_addr, devices, model_spec, model_path=model_path
     )
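For context: the three FlagEmbedding classes selected in load() share the compute_score interface. A sketch based on FlagEmbedding's documented usage; the model ids match the new spec entries below, and cutoff_layers follows the FlagEmbedding README for layerwise rerankers:

from FlagEmbedding import FlagLLMReranker, FlagReranker, LayerWiseFlagLLMReranker

pairs = [["what is panda?", "The giant panda is a bear species endemic to China."]]

print(FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True).compute_score(pairs))
print(FlagLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True).compute_score(pairs))
print(
    LayerWiseFlagLLMReranker(
        "BAAI/bge-reranker-v2-minicpm-layerwise", use_fp16=True
    ).compute_score(pairs, cutoff_layers=[28])
)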
xinference/model/rerank/model_spec.json
CHANGED

@@ -1,20 +1,44 @@
 [
   {
     "model_name": "bge-reranker-large",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-large",
     "model_revision": "27c9168d479987529781de8474dff94d69beca11"
   },
   {
     "model_name": "bge-reranker-base",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
   },
   {
     "model_name": "bce-reranker-base_v1",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "maidalun1020/bce-reranker-base_v1",
     "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
+  },
+  {
+    "model_name": "bge-reranker-v2-m3",
+    "type": "normal",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-m3",
+    "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
+  },
+  {
+    "model_name": "bge-reranker-v2-gemma",
+    "type": "LLM-based",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-gemma",
+    "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
+  },
+  {
+    "model_name": "bge-reranker-v2-minicpm-layerwise",
+    "type": "LLM-based layerwise",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
+    "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
   }
 ]
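End to end, the new entries are reachable through the documented rerank client API; a sketch with a placeholder endpoint and a toy corpus:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(model_name="bge-reranker-v2-m3", model_type="rerank")
model = client.get_model(uid)

query = "A man is eating pasta."
corpus = [
    "A man is eating food.",
    "A man is riding a horse.",
    "A monkey is playing drums.",
]
print(model.rerank(corpus, query))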