xinference 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +1 -1
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +4 -1
- xinference/core/worker.py +60 -44
- xinference/model/audio/chattts.py +25 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/cosyvoice.py +4 -3
- xinference/model/audio/custom.py +4 -5
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +25 -1
- xinference/model/embedding/custom.py +4 -5
- xinference/model/flexible/core.py +5 -1
- xinference/model/image/custom.py +4 -5
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +66 -3
- xinference/model/llm/__init__.py +6 -0
- xinference/model/llm/llm_family.json +54 -9
- xinference/model/llm/llm_family.py +7 -6
- xinference/model/llm/llm_family_modelscope.json +56 -10
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/sglang/core.py +7 -1
- xinference/model/llm/transformers/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/transformers/core.py +3 -0
- xinference/model/llm/transformers/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +94 -11
- xinference/model/llm/transformers/minicpmv25.py +2 -23
- xinference/model/llm/transformers/minicpmv26.py +2 -22
- xinference/model/llm/transformers/yi_vl.py +2 -24
- xinference/model/llm/utils.py +13 -1
- xinference/model/llm/vllm/core.py +1 -34
- xinference/model/rerank/custom.py +4 -5
- xinference/model/utils.py +41 -1
- xinference/model/video/core.py +3 -1
- xinference/model/video/diffusers.py +41 -38
- xinference/model/video/model_spec.json +24 -1
- xinference/model/video/model_spec_modelscope.json +25 -1
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/matcha/__init__.py +0 -0
- xinference/thirdparty/matcha/app.py +357 -0
- xinference/thirdparty/matcha/cli.py +419 -0
- xinference/thirdparty/matcha/data/__init__.py +0 -0
- xinference/thirdparty/matcha/data/components/__init__.py +0 -0
- xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
- xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
- xinference/thirdparty/matcha/hifigan/config.py +28 -0
- xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
- xinference/thirdparty/matcha/hifigan/env.py +17 -0
- xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
- xinference/thirdparty/matcha/hifigan/models.py +368 -0
- xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
- xinference/thirdparty/matcha/models/__init__.py +0 -0
- xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
- xinference/thirdparty/matcha/models/components/__init__.py +0 -0
- xinference/thirdparty/matcha/models/components/decoder.py +443 -0
- xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
- xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
- xinference/thirdparty/matcha/models/components/transformer.py +316 -0
- xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
- xinference/thirdparty/matcha/onnx/__init__.py +0 -0
- xinference/thirdparty/matcha/onnx/export.py +181 -0
- xinference/thirdparty/matcha/onnx/infer.py +168 -0
- xinference/thirdparty/matcha/text/__init__.py +53 -0
- xinference/thirdparty/matcha/text/cleaners.py +121 -0
- xinference/thirdparty/matcha/text/numbers.py +71 -0
- xinference/thirdparty/matcha/text/symbols.py +17 -0
- xinference/thirdparty/matcha/train.py +122 -0
- xinference/thirdparty/matcha/utils/__init__.py +5 -0
- xinference/thirdparty/matcha/utils/audio.py +82 -0
- xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
- xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
- xinference/thirdparty/matcha/utils/instantiators.py +56 -0
- xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
- xinference/thirdparty/matcha/utils/model.py +90 -0
- xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
- xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
- xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
- xinference/thirdparty/matcha/utils/pylogger.py +21 -0
- xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
- xinference/thirdparty/matcha/utils/utils.py +259 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
- xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/METADATA +31 -11
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/RECORD +189 -49
- xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
xinference/model/audio/model_spec.json
CHANGED
@@ -146,5 +146,13 @@
     "model_revision": "fb5f676733139f35670bed9b59a77d476b1aa898",
     "ability": "text-to-audio",
     "multilingual": true
+  },
+  {
+    "model_name": "FishSpeech-1.2-SFT",
+    "model_family": "FishAudio",
+    "model_id": "fishaudio/fish-speech-1.2-sft",
+    "model_revision": "180288e21ec5c50cfc564023a22f789e4b88a0e0",
+    "ability": "text-to-audio",
+    "multilingual": true
   }
 ]
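The new entry registers FishSpeech-1.2-SFT as a builtin text-to-audio model. A minimal sketch of exercising it through the Python client (the endpoint address and the output handling are assumptions for illustration, not part of this diff):

# Assumes a running Xinference endpoint with the FishSpeech-1.2-SFT spec above available.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(model_name="FishSpeech-1.2-SFT", model_type="audio")
audio_bytes = client.get_model(uid).speech("Hello from Xinference!")  # encoded audio bytes
with open("hello.mp3", "wb") as f:
    f.write(audio_bytes)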
xinference/model/embedding/core.py
CHANGED
@@ -124,6 +124,7 @@ class EmbeddingModel:
         model_path: str,
         model_spec: EmbeddingModelSpec,
         device: Optional[str] = None,
+        **kwargs,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
@@ -131,6 +132,7 @@ class EmbeddingModel:
         self._model = None
         self._counter = 0
         self._model_spec = model_spec
+        self._kwargs = kwargs
 
     def load(self):
         try:
@@ -154,10 +156,32 @@ class EmbeddingModel:
                 "gte" in self._model_spec.model_name.lower()
                 and "qwen2" in self._model_spec.model_name.lower()
             ):
+                import torch
+
+                torch_dtype_str = self._kwargs.get("torch_dtype")
+                if torch_dtype_str is not None:
+                    try:
+                        torch_dtype = getattr(torch, torch_dtype_str)
+                        if torch_dtype not in [
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        ]:
+                            logger.warning(
+                                f"Load embedding model with unsupported torch dtype : {torch_dtype_str}. Using default torch dtype: fp32."
+                            )
+                            torch_dtype = torch.float32
+                    except AttributeError:
+                        logger.warning(
+                            f"Load embedding model with unknown torch dtype '{torch_dtype_str}'. Using default torch dtype: fp32."
+                        )
+                        torch_dtype = torch.float32
+                else:
+                    torch_dtype = "auto"
                 self._model = XSentenceTransformer(
                     self._model_path,
                     device=self._device,
-                    model_kwargs={"device_map": "auto"},
+                    model_kwargs={"device_map": "auto", "torch_dtype": torch_dtype},
                 )
             else:
                 self._model = SentenceTransformer(self._model_path, device=self._device)
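With **kwargs now threaded into EmbeddingModel, a dtype hint can be passed straight through at launch time. A minimal sketch via the Python client (the endpoint and the gte-Qwen2-7B-instruct model name are assumptions for illustration):

# Assumes a running Xinference endpoint and a GTE/Qwen2 embedding spec that
# takes the XSentenceTransformer code path shown above.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="gte-Qwen2-7B-instruct",
    model_type="embedding",
    torch_dtype="float16",  # anything other than float16/float32/bfloat16 falls back to fp32
)
result = client.get_model(uid).create_embedding("hello world")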
xinference/model/embedding/custom.py
CHANGED
@@ -47,6 +47,10 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
 
+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}.")
+
     with UD_EMBEDDING_LOCK:
         for model_name in (
             list(BUILTIN_EMBEDDING_MODELS.keys())
@@ -61,11 +65,6 @@ def register_embedding(model_spec: CustomEmbeddingModelSpec, persist: bool):
         UD_EMBEDDINGS.append(model_spec)
 
     if persist:
-        # We only validate model URL when persist is True.
-        model_uri = model_spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}.")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "embedding", f"{model_spec.model_name}.json"
         )
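The URI check now runs for every registration, not only when persist is True; the same move appears below for flexible, image and LLM models. A quick sketch of the new behaviour through the REST client (the spec JSON is a hypothetical custom embedding model, not part of this diff):

import json

from xinference.client import Client

spec = {
    "model_name": "my-embedder",
    "dimensions": 768,
    "max_tokens": 512,
    "language": ["en"],
    "model_uri": "/nonexistent/path/to/model",  # path does not exist, so the URI is invalid
}
client = Client("http://127.0.0.1:9997")
# With this change the call fails with "Invalid model URI ..." even though persist=False.
client.register_model(model_type="embedding", model=json.dumps(spec), persist=False)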
xinference/model/flexible/core.py
CHANGED
@@ -99,11 +99,15 @@ def get_flexible_model_descriptions():
 
 
 def register_flexible_model(model_spec: FlexibleModelSpec, persist: bool):
-    from ..utils import is_valid_model_name
+    from ..utils import is_valid_model_name, is_valid_model_uri
 
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
 
+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}.")
+
     if model_spec.launcher_args:
         try:
             model_spec.parser_args()
xinference/model/image/custom.py
CHANGED
@@ -47,6 +47,10 @@ def register_image(model_spec: CustomImageModelFamilyV1, persist: bool):
     if not is_valid_model_name(model_spec.model_name):
         raise ValueError(f"Invalid model name {model_spec.model_name}.")
 
+    model_uri = model_spec.model_uri
+    if model_uri and not is_valid_model_uri(model_uri):
+        raise ValueError(f"Invalid model URI {model_uri}")
+
     with UD_IMAGE_LOCK:
         for model_name in (
             list(BUILTIN_IMAGE_MODELS.keys())
@@ -60,11 +64,6 @@ def register_image(model_spec: CustomImageModelFamilyV1, persist: bool):
         UD_IMAGES.append(model_spec)
 
     if persist:
-        # We only validate model URL when persist is True.
-        model_uri = model_spec.model_uri
-        if model_uri and not is_valid_model_uri(model_uri):
-            raise ValueError(f"Invalid model URI {model_uri}")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "image", f"{model_spec.model_name}.json"
         )
xinference/model/image/stable_diffusion/core.py
CHANGED
@@ -24,6 +24,9 @@ from functools import partial
 from io import BytesIO
 from typing import Dict, List, Optional, Union
 
+import PIL.Image
+from PIL import ImageOps
+
 from ....constants import XINFERENCE_IMAGE_DIR
 from ....device_utils import move_model_to_available_device
 from ....types import Image, ImageList, LoRA
@@ -46,8 +49,13 @@ class DiffusionModel:
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
+        # when a model has text2image ability,
+        # it will be loaded as AutoPipelineForText2Image
+        # for image2image and inpainting,
+        # we convert to the corresponding model
         self._model = None
         self._i2i_model = None  # image to image model
+        self._inpainting_model = None  # inpainting model
         self._lora_model = lora_model
         self._lora_load_kwargs = lora_load_kwargs or {}
         self._lora_fuse_kwargs = lora_fuse_kwargs or {}
@@ -152,6 +160,10 @@ class DiffusionModel:
         model=None,
         **kwargs,
     ):
+        import gc
+
+        from ....device_utils import empty_cache
+
         logger.debug(
             "stable diffusion args: %s",
             kwargs,
@@ -159,6 +171,11 @@ class DiffusionModel:
         model = model if model is not None else self._model
         assert callable(model)
         images = model(**kwargs).images
+
+        # clean cache
+        gc.collect()
+        empty_cache()
+
         if response_format == "url":
             os.makedirs(XINFERENCE_IMAGE_DIR, exist_ok=True)
             image_list = []
@@ -209,9 +226,17 @@ class DiffusionModel:
             **kwargs,
         )
 
+    @staticmethod
+    def pad_to_multiple(image, multiple=8):
+        x, y = image.size
+        padding_x = (multiple - x % multiple) % multiple
+        padding_y = (multiple - y % multiple) % multiple
+        padding = (0, 0, padding_x, padding_y)
+        return ImageOps.expand(image, padding)
+
     def image_to_image(
         self,
-        image:
+        image: PIL.Image,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
@@ -232,10 +257,19 @@ class DiffusionModel:
                 self._i2i_model = model = AutoPipelineForImage2Image.from_pipe(
                     self._model
                 )
+
+        if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
+            # Model like SD3 image to image requires image's height and width is times of 16
+            # padding the image if specified
+            image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
+
         if size:
             width, height = map(int, re.split(r"[^\d]+", size))
+            if padding_image_to_multiple:
+                width, height = image.size
             kwargs["width"] = width
             kwargs["height"] = height
+
         self._filter_kwargs(kwargs)
         return self._call_model(
             image=image,
@@ -249,8 +283,8 @@ class DiffusionModel:
 
     def inpainting(
         self,
-        image:
-        mask_image:
+        image: PIL.Image,
+        mask_image: PIL.Image,
         prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         n: int = 1,
@@ -258,7 +292,35 @@ class DiffusionModel:
         response_format: str = "url",
         **kwargs,
     ):
+        if "inpainting" not in self._abilities:
+            raise RuntimeError(f"{self._model_uid} does not support inpainting")
+
+        if (
+            "text2image" in self._abilities or "image2image" in self._abilities
+        ) and self._model is not None:
+            from diffusers import AutoPipelineForInpainting
+
+            if self._inpainting_model is not None:
+                model = self._inpainting_model
+            else:
+                model = self._inpainting_model = AutoPipelineForInpainting.from_pipe(
+                    self._model
+                )
+        else:
+            model = self._model
+
         width, height = map(int, re.split(r"[^\d]+", size))
+
+        if padding_image_to_multiple := kwargs.pop("padding_image_to_multiple", None):
+            # Model like SD3 inpainting requires image's height and width is times of 16
+            # padding the image if specified
+            image = self.pad_to_multiple(image, multiple=int(padding_image_to_multiple))
+            mask_image = self.pad_to_multiple(
+                mask_image, multiple=int(padding_image_to_multiple)
+            )
+            # calculate actual image size after padding
+            width, height = image.size
+
         return self._call_model(
             image=image,
             mask_image=mask_image,
@@ -268,5 +330,6 @@ class DiffusionModel:
             width=width,
             num_images_per_prompt=n,
             response_format=response_format,
+            model=model,
             **kwargs,
         )
xinference/model/llm/__init__.py
CHANGED
@@ -34,6 +34,7 @@ from .llm_family import (
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    LMDEPLOY_CLASSES,
     MLX_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
@@ -113,10 +114,12 @@ def generate_engine_config_by_model_family(model_family):
 
 def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
+    from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
     from .transformers.cogvlm2 import CogVLM2Model
+    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
     from .transformers.deepseek_vl import DeepSeekVLChatModel
     from .transformers.glm4v import Glm4VModel
@@ -147,6 +150,7 @@ def _install():
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel])
+    LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             ChatglmPytorchChatModel,
@@ -160,6 +164,7 @@ def _install():
             InternVLChatModel,
             PytorchModel,
             CogVLM2Model,
+            CogVLM2VideoModel,
             MiniCPMV25Model,
             MiniCPMV26Model,
             Glm4VModel,
@@ -174,6 +179,7 @@ def _install():
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
     SUPPORTED_ENGINES["MLX"] = MLX_CLASSES
+    SUPPORTED_ENGINES["LMDEPLOY"] = LMDEPLOY_CLASSES
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
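LMDEPLOY is now a selectable engine alongside Transformers, vLLM, SGLang, llama.cpp and MLX. A minimal sketch of picking it at launch time (the endpoint, the internvl2 family name and its AWQ spec are assumptions for illustration; the chosen family must list LMDEPLOY among its engines):

# Assumes a running Xinference endpoint with lmdeploy installed on the worker.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(
    model_name="internvl2",
    model_engine="LMDEPLOY",
    model_format="awq",
    model_size_in_billions=8,
    quantization="Int4",
)
model = client.get_model(uid)  # chat requests then go through the LMDeploy backend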
xinference/model/llm/llm_family.json
CHANGED
@@ -7189,15 +7189,6 @@
         "model_id": "OpenGVLab/InternVL2-4B",
         "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
       },
-      {
-        "model_format": "awq",
-        "model_size_in_billions": 4,
-        "quantizations": [
-          "Int4"
-        ],
-        "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-        "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
-      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 8,
@@ -7209,6 +7200,15 @@
         "model_id": "OpenGVLab/InternVL2-8B",
         "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+        "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 26,
@@ -7342,6 +7342,51 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "cogvlm2-video-llama3-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "CogVLM2-Video achieves state-of-the-art performance on multiple video question answering tasks.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "THUDM/cogvlm2-video-llama3-chat",
+        "model_revision": "f375ead7d8202ebe2c3d09f1068abdddeb2929fa"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
xinference/model/llm/llm_family.py
CHANGED
@@ -271,6 +271,8 @@ VLLM_CLASSES: List[Type[LLM]] = []
 
 MLX_CLASSES: List[Type[LLM]] = []
 
+LMDEPLOY_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 
@@ -1002,6 +1004,11 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
     if not is_valid_model_name(llm_family.model_name):
         raise ValueError(f"Invalid model name {llm_family.model_name}.")
 
+    for spec in llm_family.model_specs:
+        model_uri = spec.model_uri
+        if model_uri and not is_valid_model_uri(model_uri):
+            raise ValueError(f"Invalid model URI {model_uri}.")
+
     with UD_LLM_FAMILIES_LOCK:
         for family in BUILTIN_LLM_FAMILIES + UD_LLM_FAMILIES:
             if llm_family.model_name == family.model_name:
@@ -1013,12 +1020,6 @@ def register_llm(llm_family: LLMFamilyV1, persist: bool):
     generate_engine_config_by_model_family(llm_family)
 
     if persist:
-        # We only validate model URL when persist is True.
-        for spec in llm_family.model_specs:
-            model_uri = spec.model_uri
-            if model_uri and not is_valid_model_uri(model_uri):
-                raise ValueError(f"Invalid model URI {model_uri}.")
-
         persist_path = os.path.join(
             XINFERENCE_MODEL_DIR, "llm", f"{llm_family.model_name}.json"
         )
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -4778,10 +4778,10 @@
         "model_revision": "master"
       },
       {
-        "model_format": "
+        "model_format": "awq",
         "model_size_in_billions": 2,
         "quantizations": [
-          "
+          "Int4"
         ],
         "model_hub": "modelscope",
         "model_id": "OpenGVLab/InternVL2-2B-AWQ",
@@ -4812,10 +4812,10 @@
         "model_revision": "master"
       },
       {
-        "model_format": "
+        "model_format": "awq",
         "model_size_in_billions": 8,
         "quantizations": [
-          "
+          "Int4"
         ],
         "model_hub": "modelscope",
         "model_id": "OpenGVLab/InternVL2-8B-AWQ",
@@ -4834,10 +4834,10 @@
         "model_revision": "master"
       },
       {
-        "model_format": "
+        "model_format": "awq",
         "model_size_in_billions": 26,
         "quantizations": [
-          "
+          "Int4"
        ],
         "model_hub": "modelscope",
         "model_id": "OpenGVLab/InternVL2-26B-AWQ",
@@ -4856,10 +4856,10 @@
         "model_revision": "master"
       },
       {
-        "model_format": "
+        "model_format": "awq",
         "model_size_in_billions": 40,
         "quantizations": [
-          "
+          "Int4"
         ],
         "model_hub": "modelscope",
         "model_id": "OpenGVLab/InternVL2-40B-AWQ",
@@ -4878,10 +4878,10 @@
         "model_revision": "master"
       },
       {
-        "model_format": "
+        "model_format": "awq",
         "model_size_in_billions": 76,
         "quantizations": [
-          "
+          "Int4"
         ],
         "model_hub": "modelscope",
         "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
@@ -4962,6 +4962,52 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "cogvlm2-video-llama3-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "CogVLM2-Video achieves state-of-the-art performance on multiple video question answering tasks.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "ZhipuAI/cogvlm2-video-llama3-chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,