xinference 1.6.0.post1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/client/restful/restful_client.py +1 -1
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +9 -8
- xinference/core/model.py +13 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/worker.py +0 -10
- xinference/model/audio/model_spec.json +53 -1
- xinference/model/audio/model_spec_modelscope.json +57 -1
- xinference/model/embedding/core.py +19 -11
- xinference/model/image/model_spec.json +10 -1
- xinference/model/image/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +455 -0
- xinference/model/llm/llm_family.json +185 -397
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +199 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +4 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/vllm/core.py +4 -0
- xinference/model/rerank/core.py +7 -1
- xinference/model/rerank/utils.py +17 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
- xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
- xinference/web/ui/src/locales/en.json +3 -1
- xinference/web/ui/src/locales/zh.json +3 -1
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/METADATA +6 -4
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED

@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-05-
+ "date": "2025-05-30T19:36:43+0800",
 "dirty": false,
 "error": null,
- "full-revisionid": "
- "version": "1.6.
+ "full-revisionid": "72cc5e39040bdc49981b240c2b59af998554a75f",
+ "version": "1.6.1"
 }
 ''' # END VERSION_JSON
 
xinference/client/restful/restful_client.py
CHANGED

@@ -1017,7 +1017,7 @@ class Client:
         model_path: Optional[str]
             Model path, if gguf format, should be the file path, otherwise, should be directory of the model.
         **kwargs:
-            Any other parameters been specified.
+            Any other parameters been specified. e.g. multimodal_projector for multimodal inference with the llama.cpp backend.
 
         Returns
         -------
xinference/conftest.py
CHANGED

xinference/core/media_interface.py
CHANGED

@@ -19,7 +19,7 @@ import os
 import threading
 import time
 import uuid
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gradio as gr
 import PIL.Image
@@ -463,7 +463,7 @@ class MediaInterface:
 
     def image2video_interface(self) -> "gr.Blocks":
        def image_generate_video(
-            image: "PIL.Image",
+            image: "PIL.Image.Image",
            prompt: str,
            negative_prompt: str,
            num_frames: int,
@@ -653,13 +653,14 @@ class MediaInterface:
            with open(prompt_speech_file, "rb") as f:
                prompt_speech_bytes = f.read()
 
+            kw: Dict[str, Any] = {}
+            if prompt_speech_bytes:
+                kw["prompt_speech"] = prompt_speech_bytes
+            if prompt_text:
+                kw["prompt_text"] = prompt_text
+
            response = model.speech(
-                input=input_text,
-                voice=voice,
-                speed=speed,
-                response_format="mp3",
-                prompt_speech=prompt_speech_bytes,
-                prompt_text=prompt_text,
+                input=input_text, voice=voice, speed=speed, response_format="mp3", **kw
            )
 
            # Write to a temp .mp3 file and return its path
xinference/core/model.py
CHANGED

@@ -71,12 +71,8 @@ except ImportError:
     OutOfMemoryError = _OutOfMemoryError
 
 
-
-
-    "cogvlm2",
-    "glm-4v",
-    "MiniCPM-V-2.6",
-]
+# !!!!! DO NOT add model_name to this list, using `register_batching_multimodal_models` below instead.
+XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = []
 
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_ALLOWED_MODELS = ["FLUX.1-dev", "FLUX.1-schnell"]
 XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
@@ -84,6 +80,16 @@ XINFERENCE_TEST_OUT_OF_MEMORY_ERROR = bool(
 )
 
 
+def register_batching_multimodal_models(*model_names: str):
+    def decorator(cls):
+        for name in model_names:
+            if name not in XINFERENCE_BATCHING_ALLOWED_VISION_MODELS:
+                XINFERENCE_BATCHING_ALLOWED_VISION_MODELS.append(name)
+        return cls
+
+    return decorator
+
+
 def request_limit(fn):
     """
     Used by ModelActor.
@@ -977,6 +983,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
                response_format,
                temperature,
                timestamp_granularities,
+                **kwargs,
            )
        raise AttributeError(
            f"Model {self._model.model_spec} is not for creating transcriptions."
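For context, a minimal sketch of how a model class is expected to opt in to continuous batching with the decorator introduced above; the class and model names here are illustrative placeholders, not classes shipped in 1.6.1:

from xinference.core.model import register_batching_multimodal_models


@register_batching_multimodal_models("my-vision-model")
class MyVisionChatModel:
    """Illustrative stand-in for a transformers-based multimodal chat model."""
    ...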
xinference/core/scheduler.py
CHANGED

@@ -272,15 +272,6 @@ class InferenceRequest:
        )
 
 
-def _get_valid_batch_kv_cache(cache, skipped_indexes: Set[int]):
-    batch_size = cache.key_cache[0].shape[0]
-    batch_slices = [num for num in range(batch_size) if num not in skipped_indexes]
-    for idx in range(len(cache)):
-        cache.key_cache[idx] = cache.key_cache[idx][batch_slices, ::].contiguous()
-        cache.value_cache[idx] = cache.value_cache[idx][batch_slices, ::].contiguous()
-    return cache
-
-
 class SchedulerActor(xo.StatelessActor):
     @classmethod
     def gen_uid(cls, model_uid: str, replica_id: str):
@@ -409,7 +400,7 @@ class SchedulerActor(xo.StatelessActor):
        # Some requests have been completed. Batch size needs to be reduced for kv cache.
        if stopped_batch_indexes and len(self._running_queue) > 0:
            kv_cache = self._running_queue[0].kv_cache
-            reduced_kv_cache = _get_valid_batch_kv_cache(
+            reduced_kv_cache = self._model.build_reduced_kv_cache(
                kv_cache, stopped_batch_indexes
            )
            for r in self._running_queue:
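The KV-cache slicing itself has not disappeared; the scheduler now delegates it to the model object. A rough sketch of what build_reduced_kv_cache is expected to do, reconstructed from the helper removed above (its actual body in 1.6.1 lives on the model class and may differ):

from typing import Set


def build_reduced_kv_cache(cache, skipped_indexes: Set[int]):
    # Keep only the batch rows whose requests are still running.
    batch_size = cache.key_cache[0].shape[0]
    keep = [num for num in range(batch_size) if num not in skipped_indexes]
    for idx in range(len(cache)):
        cache.key_cache[idx] = cache.key_cache[idx][keep, ::].contiguous()
        cache.value_cache[idx] = cache.value_cache[idx][keep, ::].contiguous()
    return cache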
xinference/core/worker.py
CHANGED

@@ -533,16 +533,6 @@ class WorkerActor(xo.StatelessActor):
                existing_model_uids.append(rep_uid)
            if idx in self._gpu_to_embedding_model_uids:
                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
-            # If user has run the vLLM model on the GPU that was forced to be specified,
-            # it is not possible to force this GPU to be allocated again
-            if idx in self._user_specified_gpu_to_model_uids:
-                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
-                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
-                    if is_vllm_model:
-                        raise RuntimeError(
-                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
-                            f"therefore cannot allocate GPU memory for a new model."
-                        )
 
            if existing_model_uids:
                logger.warning(
xinference/model/audio/model_spec.json
CHANGED

@@ -218,13 +218,65 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json
CHANGED

@@ -51,6 +51,55 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
     "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
     "model_revision": "master",
     "model_ability": ["audio2text"],
@@ -70,7 +119,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/embedding/core.py
CHANGED

@@ -651,19 +651,27 @@ class EmbeddingModel:
            img = Image.open(image_data)
            return img
 
-        objs: list[
-
-
-
-
-
-        if
-
-
+        objs: list[str] = []
+        if isinstance(sentences, str):
+            objs.append(sentences)
+        else:
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
                    else:
-
+                        raise ValueError("Please check the input data.")
+                elif isinstance(item, str):
+                    objs.append(item)
                else:
-
+                    raise ValueError("Please check the input data.")
+
        all_embeddings, all_token_nums = encode(
            self._model,
            objs,
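As the rewritten parsing above shows, the input may be a single string, a list of strings, or a list of dicts carrying either a "text" or an "image" entry, with images given as URLs/paths or as base64 data URIs. A hypothetical input illustrating the accepted shapes; the values are placeholders:

sentences = [
    "a plain text sentence",
    {"text": "another text item"},
    {"image": "https://example.com/cat.png"},
    {"image": "data:image/png;base64,iVBORw0KGgo..."},  # decoded via base64_to_image
]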
xinference/model/image/model_spec.json
CHANGED

@@ -303,7 +303,16 @@
     "model_ability": [
       "text2image",
       "image2image"
-    ]
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "stable-diffusion-inpainting",
xinference/model/image/model_spec_modelscope.json
CHANGED

@@ -307,6 +307,26 @@
       }
     ]
   },
+  {
+    "model_name": "kolors",
+    "model_family": "stable_diffusion",
+    "model_hub": "modelscope",
+    "model_id": "JunHowie/Kolors-diffusers",
+    "model_revision": "master",
+    "model_ability": [
+      "text2image",
+      "image2image"
+    ],
+    "default_model_config": {
+      "variant": "fp16"
+    },
+    "virtualenv": {
+      "packages": [
+        "diffusers>=0.30.0",
+        "#system_numpy#"
+      ]
+    }
+  },
   {
     "model_name": "GOT-OCR2_0",
     "model_family": "ocr",
xinference/model/llm/__init__.py
CHANGED

@@ -73,7 +73,7 @@ def generate_engine_config_by_model_family(model_family):
        model_size_in_billions = spec.model_size_in_billions
        quantizations = spec.quantizations
        for quantization in quantizations:
-            # traverse all supported engines to match the name, format, size in billions and
+            # traverse all supported engines to match the name, format, size in billions and quantization of model
            for engine in SUPPORTED_ENGINES:
                if not check_format_with_engine(
                    model_format, engine
@@ -107,6 +107,10 @@ def generate_engine_config_by_model_family(model_family):
                            "llm_class": cls,
                        }
                    )
+                    if hasattr(spec, "multimodal_projectors"):
+                        engine_params[-1][
+                            "multimodal_projectors"
+                        ] = spec.multimodal_projectors
                    engines[engine] = engine_params
                    break
    LLM_ENGINES[model_name] = engines
@@ -163,36 +167,9 @@ def _install():
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
     from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel, SGLANGVisionModel
-    from .transformers.chatglm import ChatglmPytorchChatModel
-    from .transformers.cogagent import CogAgentChatModel
-    from .transformers.cogvlm2 import CogVLM2Model
-    from .transformers.cogvlm2_video import CogVLM2VideoModel
     from .transformers.core import PytorchChatModel, PytorchModel
-    from .transformers.deepseek_v2 import (
-        DeepSeekV2PytorchChatModel,
-        DeepSeekV2PytorchModel,
-    )
-    from .transformers.deepseek_vl import DeepSeekVLChatModel
-    from .transformers.deepseek_vl2 import DeepSeekVL2ChatModel
-    from .transformers.gemma3 import Gemma3ChatModel, Gemma3TextChatModel
-    from .transformers.glm4v import Glm4VModel
-    from .transformers.glm_edge_v import GlmEdgeVModel
-    from .transformers.minicpmv25 import MiniCPMV25Model
-    from .transformers.minicpmv26 import MiniCPMV26Model
-    from .transformers.opt import OptPytorchModel
-    from .transformers.ovis2 import Ovis2ChatModel
-    from .transformers.qwen2_audio import Qwen2AudioChatModel
-    from .transformers.qwen_vl import QwenVLChatModel
     from .vllm.core import VLLMChatModel, VLLMModel, VLLMVisionModel
 
-    try:
-        from .transformers.omnilmm import OmniLMMModel
-    except ImportError as e:
-        # For quite old transformers version,
-        # import will generate error
-        OmniLMMModel = None
-        warnings.warn(f"Cannot import OmniLLMModel due to reason: {e}")
-
     # register llm classes.
     LLAMA_CLASSES.extend(
         [
@@ -203,32 +180,7 @@ def _install():
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
     MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
-    TRANSFORMERS_CLASSES.extend(
-        [
-            ChatglmPytorchChatModel,
-            PytorchChatModel,
-            QwenVLChatModel,
-            Qwen2AudioChatModel,
-            DeepSeekVLChatModel,
-            DeepSeekVL2ChatModel,
-            PytorchModel,
-            CogVLM2Model,
-            CogVLM2VideoModel,
-            MiniCPMV25Model,
-            MiniCPMV26Model,
-            Glm4VModel,
-            DeepSeekV2PytorchModel,
-            DeepSeekV2PytorchChatModel,
-            OptPytorchModel,
-            GlmEdgeVModel,
-            CogAgentChatModel,
-            Gemma3TextChatModel,
-            Gemma3ChatModel,
-            Ovis2ChatModel,
-        ]
-    )
-    if OmniLMMModel:  # type: ignore
-        TRANSFORMERS_CLASSES.append(OmniLMMModel)
+    TRANSFORMERS_CLASSES.extend([PytorchChatModel, PytorchModel])
 
     # support 4 engines for now
     SUPPORTED_ENGINES["vLLM"] = VLLM_CLASSES
xinference/model/llm/core.py
CHANGED

@@ -160,12 +160,14 @@ class LLMDescription(ModelDescription):
         llm_family: "LLMFamilyV1",
         llm_spec: "LLMSpecV1",
         quantization: Optional[str],
+        multimodal_projector: Optional[str] = None,
         model_path: Optional[str] = None,
     ):
         super().__init__(address, devices, model_path=model_path)
         self._llm_family = llm_family
         self._llm_spec = llm_spec
         self._quantization = quantization
+        self._multimodal_projector = multimodal_projector
 
     @property
     def spec(self):
@@ -185,6 +187,7 @@ class LLMDescription(ModelDescription):
            "model_family": self._llm_family.model_family
            or self._llm_family.model_name,
            "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
            "model_hub": self._llm_spec.model_hub,
            "revision": self._llm_spec.model_revision,
            "context_length": self._llm_family.context_length,
@@ -204,6 +207,7 @@ class LLMDescription(ModelDescription):
            "model_file_location": model_file_location,
            "cache_status": cache_status,
            "quantization": self._quantization,
+            "multimodal_projector": self._multimodal_projector,
            "model_format": self._llm_spec.model_format,
            "model_size_in_billions": self._llm_spec.model_size_in_billions,
        }
@@ -212,10 +216,19 @@ class LLMDescription(ModelDescription):
 def generate_llm_description(llm_family: "LLMFamilyV1") -> Dict[str, List[Dict]]:
     res = defaultdict(list)
     for spec in llm_family.model_specs:
+        multimodal_projectors = getattr(spec, "multimodal_projectors", None)
         for q in spec.quantizations:
-            res[llm_family.model_name].append(
-                LLMDescription(None, None, llm_family, spec, q).to_version_info()
-            )
+            if multimodal_projectors:
+                for mmproj in multimodal_projectors:
+                    res[llm_family.model_name].append(
+                        LLMDescription(
+                            None, None, llm_family, spec, q, mmproj
+                        ).to_version_info()
+                    )
+            else:
+                res[llm_family.model_name].append(
+                    LLMDescription(None, None, llm_family, spec, q).to_version_info()
+                )
     return res
 
 
@@ -260,8 +273,9 @@ def create_llm_model_instance(
     )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
 
+    multimodal_projector = kwargs.get("multimodal_projector")
     if not model_path:
-        model_path = cache(llm_family, llm_spec, quantization)
+        model_path = cache(llm_family, llm_spec, quantization, multimodal_projector)
 
     peft_model = peft_model_config.peft_model if peft_model_config else None
     if peft_model is not None:
@@ -288,5 +302,5 @@ def create_llm_model_instance(
         model_uid, llm_family, llm_spec, quantization, model_path, kwargs
     )
     return model, LLMDescription(
-        subpool_addr, devices, llm_family, llm_spec, quantization
+        subpool_addr, devices, llm_family, llm_spec, quantization, multimodal_projector
     )