xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0

xinference/model/image/stable_diffusion/core.py CHANGED

@@ -22,6 +22,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from glob import glob
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
@@ -197,8 +198,6 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         return getattr(module, class_name)
 
     def load(self):
-        from transformers import BitsAndBytesConfig, T5EncoderModel
-
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -227,58 +226,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._get_controlnet_model(*cn) for cn in controlnet
             ]
 
+        # quantizations
+        # text_encoder
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder:
-            try:
-                import bitsandbytes  # noqa: F401
-            except ImportError:
-                error_message = "Failed to import module 'bitsandbytes'"
-                installation_guide = [
-                    "Please make sure 'bitsandbytes' is installed. ",
-                    "You can install it by `pip install bitsandbytes`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-            for text_encoder_name in quantize_text_encoder.split(","):
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-                quantization_kwargs = {}
-                if torch_dtype:
-                    quantization_kwargs["torch_dtype"] = torch_dtype
-                text_encoder = T5EncoderModel.from_pretrained(
-                    self._model_path,
-                    subfolder=text_encoder_name,
-                    quantization_config=quantization_config,
-                    **quantization_kwargs,
-                )
-                self._kwargs[text_encoder_name] = text_encoder
-                self._kwargs["device_map"] = "balanced"
-
+        self._quantize_text_encoder(quantize_text_encoder)
+        # transformer
         if self._gguf_model_path:
-            # GGUF transformer
-            from diffusers import GGUFQuantizationConfig
-
-            self._kwargs["transformer"] = self._get_layer_cls(
-                "transformer"
-            ).from_single_file(
-                self._gguf_model_path,
-                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
-                torch_dtype=torch_dtype,
-                config=os.path.join(self._model_path, "transformer"),
-            )
-        elif self._kwargs.get("transformer_nf4"):
-            nf4_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch_dtype,
-            )
-            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
-                self._model_path,
-                subfolder="transformer",
-                quantization_config=nf4_config,
-                torch_dtype=torch_dtype,
-            )
-            self._kwargs["transformer"] = model_nf4
+            self._quantize_transformer_gguf()
+        else:
+            self._quantize_transformer()
 
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
@@ -308,6 +264,133 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 cache_branch_id=self._kwargs.get("deepcache_cache_branch_id", 0),
             )
 
+    def _get_quantize_config(self, method: str, quantization: str, module: str):
+        if method == "bnb":
+            try:
+                import bitsandbytes  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'bitsandbytes'"
+                installation_guide = [
+                    "Please make sure 'bitsandbytes' is installed. ",
+                    "You can install it by `pip install bitsandbytes`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import BitsAndBytesConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import BitsAndBytesConfig
+
+            if quantization == "4-bit":
+                return BitsAndBytesConfig(load_in_4bit=True)
+            elif quantization == "8-bit":
+                return BitsAndBytesConfig(load_in_8bit=True)
+            elif quantization == "nf4":
+                return BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=self._torch_dtype,
+                )
+        elif method == "torchao":
+            try:
+                import torchao  # noqa: F401
+            except ImportError:
+                error_message = "Failed to import module 'torchao'"
+                installation_guide = [
+                    "Please make sure 'torchao' is installed. ",
+                    "You can install it by `pip install torchao`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+            if module.startswith("diffusers."):
+                from diffusers import TorchAoConfig
+            else:
+                assert module.startswith("transformers.")
+                from transformers import TorchAoConfig
+
+            return TorchAoConfig(quantization)
+        else:
+            raise ValueError(f"Unknown quantization method for image model: {method}")
+
+    def _quantize_text_encoder(self, quantize_text_encoder: Optional[str]):
+        if self._gguf_model_path:
+            # skip quantization when gguf applied to transformer
+            return
+
+        if not quantize_text_encoder:
+            return
+
+        quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
+        quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
+
+        torch_dtype = self._torch_dtype
+        for text_encoder_name in quantize_text_encoder.split(","):
+            quantization_kwargs: Dict[str, Any] = {}
+            if torch_dtype:
+                quantization_kwargs["torch_dtype"] = torch_dtype
+            text_encoder_cls = self._get_layer_cls(text_encoder_name)
+            quantization_config = self._get_quantize_config(
+                quantization_method, quantization, text_encoder_cls.__module__
+            )
+            text_encoder = text_encoder_cls.from_pretrained(
+                self._model_path,
+                subfolder=text_encoder_name,
+                quantization_config=quantization_config,
+                **quantization_kwargs,
+            )
+            self._kwargs[text_encoder_name] = text_encoder
+        else:
+            if not self._kwargs.get("device_map"):
+                self._kwargs["device_map"] = "balanced"
+
+    def _quantize_transformer(self):
+        quantization = None
+        nf4 = self._kwargs.pop("transformer_nf4", None)
+        if nf4:
+            warnings.warn(
+                "`transformer_nf4` is deprecated, please use `transformer_quantization=nf4`",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            quantization = "nf4"
+        method = self._kwargs.pop("transformer_quantize_method", "bnb")
+        if not quantization:
+            quantization = self._kwargs.pop("transformer_quantization", None)
+
+        if not quantization:
+            # skip if no quantization specified
+            return
+
+        torch_dtype = self._torch_dtype
+        transformer_cls = self._get_layer_cls("transformer")
+        quantization_config = self._get_quantize_config(
+            method, quantization, transformer_cls.__module__
+        )
+        transformer_model = transformer_cls.from_pretrained(
+            self._model_path,
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch_dtype,
+        )
+        self._kwargs["transformer"] = transformer_model
+
+    def _quantize_transformer_gguf(self):
+        from diffusers import GGUFQuantizationConfig
+
+        # GGUF transformer
+        torch_dtype = self._torch_dtype
+        self._kwargs["transformer"] = self._get_layer_cls(
+            "transformer"
+        ).from_single_file(
+            self._gguf_model_path,
+            quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+            torch_dtype=torch_dtype,
+            config=os.path.join(self._model_path, "transformer"),
+        )
+
     def _load_to_device(self, model):
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
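
The new _get_quantize_config / _quantize_text_encoder / _quantize_transformer helpers are driven entirely by kwargs popped from self._kwargs. A minimal usage sketch, assuming these kwargs are forwarded unchanged from the client launch call as in earlier releases; the model name and encoder subfolder below are illustrative, not taken from this diff:

from xinference.client import Client

client = Client("http://localhost:9997")

# Illustrative launch: quantize the named text encoder to 8-bit via bitsandbytes
# and the DiT transformer to nf4, matching the kwargs popped in load().
model_uid = client.launch_model(
    model_name="FLUX.1-dev",                 # assumed image model name
    model_type="image",
    quantize_text_encoder="text_encoder_2",  # comma-separated subfolder names
    text_encoder_quantize_method="bnb",      # "bnb" or "torchao"
    text_encoder_quantization="8-bit",       # "4-bit" / "8-bit" / "nf4" for bnb
    transformer_quantize_method="bnb",
    transformer_quantization="nf4",          # replaces the deprecated transformer_nf4=True
)
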
@@ -321,7 +404,15 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         if self._kwargs.get("attention_slicing", False):
             model.enable_attention_slicing()
         if self._kwargs.get("vae_tiling", False):
-            model.enable_vae_tiling()
+            try:
+                model.enable_vae_tiling()
+            except AttributeError:
+                model.vae.enable_tiling()
+        if self._kwargs.get("vae_slicing", False):
+            try:
+                model.enable_vae_slicing()
+            except AttributeError:
+                model.vae.enable_slicing()
 
     def get_max_num_images_for_batching(self):
         return self._kwargs.get("max_num_images", 16)

xinference/model/llm/__init__.py CHANGED

@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
         model_size_in_billions = spec.model_size_in_billions
         quantizations = spec.quantizations
         for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
            for engine in SUPPORTED_ENGINES:
                 if not check_format_with_engine(
                     model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                         "llm_class": cls,
                     }
                 )
+                if hasattr(spec, "multimodal_projectors"):
+                    engine_params[-1][
+                        "multimodal_projectors"
+                    ] = spec.multimodal_projectors
                 engines[engine] = engine_params
                 break
     LLM_ENGINES[model_name] = engines
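
With this change, engine entries generated for GGUF specs that declare projector files also carry multimodal_projectors. A hedged inspection sketch; the model name and engine key are assumptions, and only the llm_class and multimodal_projectors keys are confirmed by this hunk:

from xinference.model.llm import LLM_ENGINES

# Assumed model/engine names for illustration only.
for params in LLM_ENGINES.get("gemma-3-it", {}).get("llama.cpp", []):
    print(params["llm_class"], params.get("multimodal_projectors"))
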
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
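
After this cleanup only the generic Pytorch classes are registered here; the model-specific Transformers classes are expected to come from xinference.model.llm.transformers itself (see its __init__.py and the new multimodal package in the file list). A quick, hedged way to see what actually ends up registered:

from xinference.model.llm import TRANSFORMERS_CLASSES, _install

_install()  # idempotence is not guaranteed; run once in a fresh interpreter
print(sorted(cls.__name__ for cls in TRANSFORMERS_CLASSES))
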

xinference/model/llm/core.py CHANGED

@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
             "model_family": self._llm_family.model_family
             or self._llm_family.model_name,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_hub": self._llm_spec.model_hub,
             "revision": self._llm_spec.model_revision,
             "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
             "model_file_location": model_file_location,
             "cache_status": cache_status,
             "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
             "model_format": self._llm_spec.model_format,
             "model_size_in_billions": self._llm_spec.model_size_in_billions,
         }
@@ -212,10 +216,19 @@ class LLMDescription(ModelDescription):
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
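
The version listing now fans out over projector files as well as quantizations. When projectors are present the expansion is equivalent to a cartesian product; a standalone sketch with made-up values:

from itertools import product

quantizations = ["Q4_K_M", "Q8_0"]
multimodal_projectors = ["mmproj-model-f16.gguf"]  # None/absent for most specs

if multimodal_projectors:
    versions = list(product(quantizations, multimodal_projectors))
else:
    versions = [(q, None) for q in quantizations]

print(versions)
# [('Q4_K_M', 'mmproj-model-f16.gguf'), ('Q8_0', 'mmproj-model-f16.gguf')]
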
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
     )

xinference/model/llm/llama_cpp/core.py CHANGED

@@ -15,6 +15,7 @@ import concurrent.futures
 import importlib.util
 import logging
 import os
+import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
@@ -24,6 +25,7 @@ from ....types import ChatCompletion, ChatCompletionChunk, Completion, Completio
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
+from .memory import estimate_gpu_layers
 
 logger = logging.getLogger(__name__)
 
@@ -95,7 +97,12 @@ class XllamaCppModel(LLM, ChatModelMixin):
 
     def load(self):
         try:
-            from xllamacpp import CommonParams, Server
+            from xllamacpp import (
+                CommonParams,
+                Server,
+                get_device_info,
+                ggml_backend_dev_type,
+            )
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -135,6 +142,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if os.path.exists(legacy_model_file_path):
             model_path = legacy_model_file_path
 
+        multimodal_projector = self._llamacpp_model_config.get(
+            "multimodal_projector", ""
+        )
+        mmproj = (
+            os.path.join(self.model_path, multimodal_projector)
+            if multimodal_projector
+            else ""
+        )
+
         try:
             params = CommonParams()
             # Compatible with xllamacpp changes
@@ -142,6 +158,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 params.model = model_path
             except Exception:
                 params.model.path = model_path
+            params.mmproj.path = mmproj
             if self.model_family.chat_template:
                 params.chat_template = self.model_family.chat_template
             # This is the default value, could be overwritten by _llamacpp_model_config
@@ -165,6 +182,41 @@ class XllamaCppModel(LLM, ChatModelMixin):
             # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             # 0x7FFFFFFF is INT32 max, will be auto set to all layers
             params.n_gpu_layers = 0x7FFFFFFF
+            try:
+                device_info = get_device_info()
+                gpus = [
+                    info
+                    for info in device_info
+                    if info["type"]
+                    == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
+                ]
+                if gpus:
+                    logger.info(
+                        "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
+                        params.n_ctx,
+                        params.n_batch,
+                        params.n_parallel,
+                        pprint.pformat(gpus),
+                    )
+                    estimate = estimate_gpu_layers(
+                        gpus=gpus,
+                        model_path=model_path,
+                        projectors=[mmproj] if mmproj else [],
+                        context_length=params.n_ctx,
+                        batch_size=params.n_batch,
+                        num_parallel=params.n_parallel,
+                        kv_cache_type="",
+                    )
+                    logger.info("Estimate num gpu layers: %s", estimate)
+                    if estimate.tensor_split:
+                        params.tensor_split = estimate.tensor_split
+                    else:
+                        params.n_gpu_layers = estimate.layers
+            except Exception as e:
+                logger.exception(
+                    "Estimate num gpu layers for llama.cpp backend failed: %s", e
+                )
+
             self._llm = Server(params)
             self._executor = concurrent.futures.ThreadPoolExecutor(
                 max_workers=max(10, n_threads)
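
The estimator comes from the new llama_cpp/memory.py (+457 lines in the file list). The sketch below relies only on the call signature and the layers / tensor_split result fields visible in this hunk; the GGUF path is a placeholder:

from xllamacpp import get_device_info, ggml_backend_dev_type
from xinference.model.llm.llama_cpp.memory import estimate_gpu_layers

gpus = [
    info
    for info in get_device_info()
    if info["type"] == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
]
estimate = estimate_gpu_layers(
    gpus=gpus,
    model_path="/path/to/model-Q4_K_M.gguf",  # placeholder path
    projectors=[],                            # e.g. [mmproj_path] for vision models
    context_length=8192,
    batch_size=512,
    num_parallel=1,
    kv_cache_type="",
)
print(estimate.layers, estimate.tensor_split)
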
@@ -207,11 +259,13 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
         except Exception as ex:
             logger.exception("handle_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -271,6 +325,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 q.put(res)
             except Exception as e:
                 logger.exception("handle_chat_completions callback failed: %s", e)
+                q.put(_Error(str(e)))
 
         try:
             self._llm.handle_chat_completions(
@@ -278,6 +333,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             )
         except Exception as ex:
             logger.exception("handle_chat_completions failed: %s", ex)
+            q.put(_Error(str(ex)))
         q.put(_Done)
 
         assert self._executor
@@ -288,7 +344,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
         def _to_iterator():
             while (r := q.get()) is not _Done:
                 if type(r) is _Error:
-                    raise Exception("Got error in chat stream:
+                    raise Exception(f"Got error in chat stream: {r.msg}")
                 # Get valid keys (O(1) lookup)
                 chunk_keys = ChatCompletionChunk.__annotations__
                 # The chunk may contain additional keys (e.g., system_fingerprint),
@@ -302,5 +358,5 @@ class XllamaCppModel(LLM, ChatModelMixin):
         else:
             r = q.get()
             if type(r) is _Error:
-                raise Exception("Got error in chat:
+                raise Exception(f"Got error in chat: {r.msg}")
             return self._to_chat_completion(r, self.reasoning_parser)