xinference 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (104)
  1. xinference/_compat.py +2 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +23 -1
  4. xinference/core/model.py +1 -6
  5. xinference/core/utils.py +10 -6
  6. xinference/model/audio/core.py +5 -0
  7. xinference/model/audio/cosyvoice.py +25 -3
  8. xinference/model/audio/f5tts.py +15 -10
  9. xinference/model/audio/f5tts_mlx.py +260 -0
  10. xinference/model/audio/fish_speech.py +35 -111
  11. xinference/model/audio/model_spec.json +19 -3
  12. xinference/model/audio/model_spec_modelscope.json +9 -0
  13. xinference/model/audio/utils.py +32 -0
  14. xinference/model/image/core.py +69 -1
  15. xinference/model/image/model_spec.json +127 -4
  16. xinference/model/image/model_spec_modelscope.json +130 -4
  17. xinference/model/image/stable_diffusion/core.py +45 -13
  18. xinference/model/llm/llm_family.json +47 -0
  19. xinference/model/llm/llm_family.py +15 -36
  20. xinference/model/llm/llm_family_modelscope.json +49 -0
  21. xinference/model/llm/mlx/core.py +68 -13
  22. xinference/model/llm/transformers/core.py +1 -0
  23. xinference/model/llm/transformers/qwen2_vl.py +2 -0
  24. xinference/model/llm/utils.py +1 -0
  25. xinference/model/llm/vllm/core.py +11 -2
  26. xinference/thirdparty/cosyvoice/bin/average_model.py +92 -0
  27. xinference/thirdparty/cosyvoice/bin/export_jit.py +12 -2
  28. xinference/thirdparty/cosyvoice/bin/export_onnx.py +112 -0
  29. xinference/thirdparty/cosyvoice/bin/export_trt.sh +9 -0
  30. xinference/thirdparty/cosyvoice/bin/inference.py +5 -7
  31. xinference/thirdparty/cosyvoice/bin/train.py +42 -8
  32. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +96 -25
  33. xinference/thirdparty/cosyvoice/cli/frontend.py +77 -30
  34. xinference/thirdparty/cosyvoice/cli/model.py +330 -80
  35. xinference/thirdparty/cosyvoice/dataset/dataset.py +6 -2
  36. xinference/thirdparty/cosyvoice/dataset/processor.py +76 -14
  37. xinference/thirdparty/cosyvoice/flow/decoder.py +92 -13
  38. xinference/thirdparty/cosyvoice/flow/flow.py +99 -9
  39. xinference/thirdparty/cosyvoice/flow/flow_matching.py +110 -13
  40. xinference/thirdparty/cosyvoice/flow/length_regulator.py +5 -4
  41. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +140 -0
  42. xinference/thirdparty/cosyvoice/hifigan/generator.py +58 -42
  43. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +67 -0
  44. xinference/thirdparty/cosyvoice/llm/llm.py +139 -6
  45. xinference/thirdparty/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  46. xinference/thirdparty/cosyvoice/tokenizer/tokenizer.py +279 -0
  47. xinference/thirdparty/cosyvoice/transformer/embedding.py +2 -2
  48. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +7 -7
  49. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +318 -0
  50. xinference/thirdparty/cosyvoice/utils/common.py +28 -1
  51. xinference/thirdparty/cosyvoice/utils/executor.py +69 -7
  52. xinference/thirdparty/cosyvoice/utils/file_utils.py +2 -12
  53. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +9 -5
  54. xinference/thirdparty/cosyvoice/utils/losses.py +20 -0
  55. xinference/thirdparty/cosyvoice/utils/scheduler.py +1 -2
  56. xinference/thirdparty/cosyvoice/utils/train_utils.py +101 -45
  57. xinference/thirdparty/fish_speech/fish_speech/conversation.py +94 -83
  58. xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +63 -20
  59. xinference/thirdparty/fish_speech/fish_speech/text/clean.py +1 -26
  60. xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +1 -1
  61. xinference/thirdparty/fish_speech/fish_speech/tokenizer.py +152 -0
  62. xinference/thirdparty/fish_speech/fish_speech/train.py +2 -2
  63. xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
  64. xinference/thirdparty/fish_speech/tools/{post_api.py → api_client.py} +7 -13
  65. xinference/thirdparty/fish_speech/tools/api_server.py +98 -0
  66. xinference/thirdparty/fish_speech/tools/download_models.py +5 -5
  67. xinference/thirdparty/fish_speech/tools/fish_e2e.py +2 -2
  68. xinference/thirdparty/fish_speech/tools/inference_engine/__init__.py +192 -0
  69. xinference/thirdparty/fish_speech/tools/inference_engine/reference_loader.py +125 -0
  70. xinference/thirdparty/fish_speech/tools/inference_engine/utils.py +39 -0
  71. xinference/thirdparty/fish_speech/tools/inference_engine/vq_manager.py +57 -0
  72. xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +2 -2
  73. xinference/thirdparty/fish_speech/tools/llama/generate.py +117 -89
  74. xinference/thirdparty/fish_speech/tools/run_webui.py +104 -0
  75. xinference/thirdparty/fish_speech/tools/schema.py +11 -28
  76. xinference/thirdparty/fish_speech/tools/server/agent/__init__.py +57 -0
  77. xinference/thirdparty/fish_speech/tools/server/agent/generate.py +119 -0
  78. xinference/thirdparty/fish_speech/tools/server/agent/generation_utils.py +122 -0
  79. xinference/thirdparty/fish_speech/tools/server/agent/pre_generation_utils.py +72 -0
  80. xinference/thirdparty/fish_speech/tools/server/api_utils.py +75 -0
  81. xinference/thirdparty/fish_speech/tools/server/exception_handler.py +27 -0
  82. xinference/thirdparty/fish_speech/tools/server/inference.py +45 -0
  83. xinference/thirdparty/fish_speech/tools/server/model_manager.py +122 -0
  84. xinference/thirdparty/fish_speech/tools/server/model_utils.py +129 -0
  85. xinference/thirdparty/fish_speech/tools/server/views.py +246 -0
  86. xinference/thirdparty/fish_speech/tools/webui/__init__.py +173 -0
  87. xinference/thirdparty/fish_speech/tools/webui/inference.py +91 -0
  88. xinference/thirdparty/fish_speech/tools/webui/variables.py +14 -0
  89. xinference/thirdparty/matcha/utils/utils.py +2 -2
  90. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/METADATA +11 -6
  91. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/RECORD +95 -74
  92. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  93. xinference/thirdparty/cosyvoice/bin/export_trt.py +0 -8
  94. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  95. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  96. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  97. xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
  98. xinference/thirdparty/fish_speech/tools/api.py +0 -943
  99. xinference/thirdparty/fish_speech/tools/msgpack_api.py +0 -95
  100. xinference/thirdparty/fish_speech/tools/webui.py +0 -548
  101. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/LICENSE +0 -0
  102. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/WHEEL +0 -0
  103. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/entry_points.txt +0 -0
  104. {xinference-1.1.0.dist-info → xinference-1.1.1.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/FLUX.1-schnell-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-schnell-{quantization}.gguf"
     },
     {
         "model_name": "FLUX.1-dev",
@@ -28,8 +44,24 @@
         ],
         "default_model_config": {
             "quantize": true,
-            "quantize_text_encoder": "text_encoder_2"
-        }
+            "quantize_text_encoder": "text_encoder_2",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "AI-ModelScope/FLUX.1-dev-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q2_K",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "flux1-dev-{quantization}.gguf"
     },
     {
         "model_name": "sd3-medium",
@@ -47,6 +79,100 @@
             "quantize_text_encoder": "text_encoder_3"
         }
     },
+    {
+        "model_name": "sd3.5-medium",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-medium",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16"
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-medium-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q3_K_M",
+            "Q3_K_S",
+            "Q4_0",
+            "Q4_1",
+            "Q4_K_M",
+            "Q4_K_S",
+            "Q5_0",
+            "Q5_1",
+            "Q5_K_M",
+            "Q5_K_S",
+            "Q6_K",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_medium-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large-{quantization}.gguf"
+    },
+    {
+        "model_name": "sd3.5-large-turbo",
+        "model_family": "stable_diffusion",
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/stable-diffusion-3.5-large-turbo",
+        "model_revision": "master",
+        "model_ability": [
+            "text2image",
+            "image2image",
+            "inpainting"
+        ],
+        "default_model_config": {
+            "quantize": true,
+            "quantize_text_encoder": "text_encoder_3",
+            "torch_dtype": "bfloat16",
+            "transformer_nf4": true
+        },
+        "default_generate_config": {
+            "guidance_scale": 1.0,
+            "num_inference_steps": 4
+        },
+        "gguf_model_id": "Xorbits/stable-diffusion-3.5-large-turbo-gguf",
+        "gguf_quantizations": [
+            "F16",
+            "Q4_0",
+            "Q4_1",
+            "Q5_0",
+            "Q5_1",
+            "Q8_0"
+        ],
+        "gguf_model_file_name_template": "sd3.5_large_turbo-{quantization}.gguf"
+    },
     {
         "model_name": "sd-turbo",
         "model_family": "stable_diffusion",
@@ -14,8 +14,10 @@
 
 import contextlib
 import gc
+import importlib
 import inspect
 import itertools
+import json
 import logging
 import os
 import re
@@ -86,6 +88,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         lora_load_kwargs: Optional[Dict] = None,
         lora_fuse_kwargs: Optional[Dict] = None,
         model_spec: Optional["ImageModelFamilyV1"] = None,
+        gguf_model_path: Optional[str] = None,
         **kwargs,
     ):
         self._model_uid = model_uid
@@ -109,6 +112,8 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
         self._model_spec = model_spec
         self._abilities = model_spec.model_ability or []  # type: ignore
         self._kwargs = kwargs
+        # gguf
+        self._gguf_model_path = gguf_model_path
 
     @property
     def model_ability(self):
@@ -184,7 +189,17 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model.fuse_lora(**self._lora_fuse_kwargs)
             logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
 
+    def _get_layer_cls(self, layer: str):
+        with open(os.path.join(self._model_path, "model_index.json")) as f:  # type: ignore
+            model_index = json.load(f)
+            layer_info = model_index[layer]
+            module_name, class_name = layer_info
+            module = importlib.import_module(module_name)
+            return getattr(module, class_name)
+
     def load(self):
+        from transformers import BitsAndBytesConfig, T5EncoderModel
+
         if "text2image" in self._abilities or "image2image" in self._abilities:
             from diffusers import AutoPipelineForText2Image as AutoPipelineModel
         elif "inpainting" in self._abilities:
@@ -200,7 +215,9 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             glob(os.path.join(self._model_path, "*/*.safetensors"))
         )
         if isinstance(torch_dtype, str):
-            self._kwargs["torch_dtype"] = getattr(torch, torch_dtype)
+            self._torch_dtype = torch_dtype = self._kwargs["torch_dtype"] = getattr(
+                torch, torch_dtype
+            )
 
         controlnet = self._kwargs.get("controlnet")
         if controlnet is not None:
@@ -212,18 +229,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             ]
 
         quantize_text_encoder = self._kwargs.pop("quantize_text_encoder", None)
-        if quantize_text_encoder:
-            try:
-                from transformers import BitsAndBytesConfig, T5EncoderModel
-            except ImportError:
-                error_message = "Failed to import module 'transformers'"
-                installation_guide = [
-                    "Please make sure 'transformers' is installed. ",
-                    "You can install it by `pip install transformers`\n",
-                ]
-
-                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
+        if quantize_text_encoder and not self._gguf_model_path:
             try:
                 import bitsandbytes  # noqa: F401
             except ImportError:
@@ -249,6 +255,32 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
                 self._kwargs[text_encoder_name] = text_encoder
                 self._kwargs["device_map"] = "balanced"
 
+        if self._gguf_model_path:
+            from diffusers import GGUFQuantizationConfig
+
+            # GGUF transformer
+            self._kwargs["transformer"] = self._get_layer_cls(
+                "transformer"
+            ).from_single_file(
+                self._gguf_model_path,
+                quantization_config=GGUFQuantizationConfig(compute_dtype=torch_dtype),
+                torch_dtype=torch_dtype,
+                config=os.path.join(self._model_path, "transformer"),
+            )
+        elif self._kwargs.get("transformer_nf4"):
+            nf4_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch_dtype,
+            )
+            model_nf4 = self._get_layer_cls("transformer").from_pretrained(
+                self._model_path,
+                subfolder="transformer",
+                quantization_config=nf4_config,
+                torch_dtype=torch_dtype,
+            )
+            self._kwargs["transformer"] = model_nf4
+
         logger.debug(
             "Loading model from %s, kwargs: %s", self._model_path, self._kwargs
         )
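Note: the load() changes above add two quantized-transformer paths: a GGUF checkpoint loaded via diffusers' GGUFQuantizationConfig, and an NF4 fallback via bitsandbytes. A hedged standalone sketch of the GGUF path outside xinference (a recent diffusers release with GGUF support is assumed; the file name and repo id are placeholders):

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Load only the transformer from a GGUF file, then plug it into the full pipeline.
    transformer = FluxTransformer2DModel.from_single_file(
        "flux1-schnell-Q4_K_S.gguf",
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    )
    image = pipe("a lighthouse at dusk", num_inference_steps=4).images[0]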
@@ -8942,5 +8942,52 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
@@ -972,46 +972,25 @@ def match_llm(
             return spec
 
     # priority: download_hub > download_from_modelscope() and download_from_csghub()
-    if download_hub == "modelscope":
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "openmind_hub":
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "csghub":
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
-    elif download_hub == "huggingface":
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    # set base model
+    base_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+    hub_families_map = {
+        "modelscope": BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        "openmind_hub": BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
+        "csghub": BUILTIN_CSGHUB_LLM_FAMILIES,
+    }
+    if download_hub == "huggingface":
+        all_families = base_families
+    elif download_hub in hub_families_map:
+        all_families = hub_families_map[download_hub] + base_families
     elif download_from_modelscope():
-        all_families = (
-            BUILTIN_MODELSCOPE_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_MODELSCOPE_LLM_FAMILIES + base_families
     elif download_from_openmind_hub():
-        all_families = (
-            BUILTIN_OPENMIND_HUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_OPENMIND_HUB_LLM_FAMILIES + base_families
     elif download_from_csghub():
-        all_families = (
-            BUILTIN_CSGHUB_LLM_FAMILIES
-            + BUILTIN_LLM_FAMILIES
-            + user_defined_llm_families
-        )
+        all_families = BUILTIN_CSGHUB_LLM_FAMILIES + base_families
     else:
-        all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+        all_families = base_families
 
     for family in all_families:
         if model_name != family.model_name:
@@ -6673,5 +6673,54 @@
             "<|user|>",
             "<|observation|>"
         ]
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "QvQ-72B-Preview",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "QVQ-72B-Preview is an experimental research model developed by the Qwen team, focusing on enhancing visual reasoning capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "Qwen/QVQ-72B-Preview",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "3bit",
+                    "4bit",
+                    "6bit",
+                    "8bit",
+                    "bf16"
+                ],
+                "model_id": "mlx-community/QVQ-72B-Preview-{quantization}",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+        "stop_token_ids": [
+            151645,
+            151643
+        ],
+        "stop": [
+            "<|im_end|>",
+            "<|endoftext|>"
+        ]
     }
 ]
@@ -173,7 +173,9 @@ class MLXModel(LLM):
             return False
         return True
 
-    def _get_prompt_cache(self, prompt, lora_name: Optional[str] = None):
+    def _get_prompt_cache(
+        self, prompt, lora_name: Optional[str] = None, model: Any = None
+    ):
         from mlx_lm.models.cache import make_prompt_cache
 
         assert self._prompt_cache is not None
@@ -185,7 +187,9 @@ class MLXModel(LLM):
             or self._prompt_cache.tokens != prompt[:cache_len]
         ):
             self._prompt_cache.model_key = model_key
-            self._prompt_cache.cache = make_prompt_cache(self._model, self._max_kv_size)
+            self._prompt_cache.cache = make_prompt_cache(
+                model or self._model, self._max_kv_size
+            )
             self._prompt_cache.tokens = []
             logger.debug("Making new prompt cache for %s", self.model_uid)
         else:
@@ -458,6 +462,8 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        self._prompt_cache = PromptCache()
+
         return load(self.model_path)
 
     def load(self):
@@ -471,13 +477,52 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         self._model, self._processor = self._load_model(**kwargs)
         self._tokenizer = self._processor.tokenizer
 
+    def _generate_stream_inner_no_image(self, **kwargs):
+        import mlx.nn as nn
+        from mlx_lm.utils import make_sampler, stream_generate
+
+        # For mlx-lm, the model(inputs) will return logits,
+        # but the language model in mlx-vlm will return an object
+        # https://github.com/Blaizzy/mlx-vlm/blob/3f5e1620072440afb7496940f67ac1c7fc64056f/mlx_vlm/models/base.py#L260
+        # so we cannot pass the language model to stream_generate directly
+        # we wrap here to just let model(inputs) return logits to pass stream_generate
+        class ModelWrapper(nn.Module):
+            def __init__(self, model):
+                super().__init__()
+                self._model = model.language_model
+
+            @property
+            def layers(self):
+                return self._model.layers
+
+            def __call__(self, *args, **kwargs):
+                return self._model(*args, **kwargs).logits
+
+        sampler = make_sampler(
+            temp=kwargs.pop("temperature"), top_p=kwargs.pop("top_p")
+        )
+        prompt_token_ids = kwargs.pop("prompt_token_ids")
+        yield from stream_generate(
+            ModelWrapper(self._model),
+            self._tokenizer,
+            prompt_token_ids,
+            sampler=sampler,
+            **kwargs,
+        )
+
     def _generate_stream_inner(self, **kwargs):
         import mlx.core as mx
         from mlx_lm.utils import GenerationResponse
         from mlx_vlm.utils import generate_step
 
-        max_tokens = kwargs.pop("max_tokens")
         inputs = kwargs["prompt_token_ids"]
+
+        if not isinstance(inputs, tuple):
+            # no images
+            yield from self._generate_stream_inner_no_image(**kwargs)
+            return
+
+        max_tokens = kwargs.pop("max_tokens")
         input_ids, pixel_values, mask = inputs[:3]
 
         kwargs = {
@@ -549,16 +594,26 @@ class MLXVisionModel(MLXModel, ChatModelMixin):
         else:
             image_token_index = None
 
-        inputs = prepare_inputs(
-            None,
-            self._processor,
-            images,
-            prompt_str,
-            image_token_index,
-            kwargs.get("resize_shape"),
-        )
-        input_ids = inputs[0]
-        return inputs, len(input_ids)
+        if not images:
+            prompt = prompt["prompt"]  # type: ignore
+            prompt_token_ids = self._tokenizer.encode(prompt)
+            prompt_token_ids = self._get_prompt_cache(
+                prompt_token_ids,
+                kwargs.get("lora_name"),
+                model=self._model.language_model,
+            )
+            return prompt_token_ids, len(prompt_token_ids)
+        else:
+            inputs = prepare_inputs(
+                None,
+                self._processor,
+                images,
+                prompt_str,
+                image_token_index,
+                kwargs.get("resize_shape"),
+            )
+            input_ids = inputs[0]
+            return inputs, len(input_ids)
 
     def chat(
         self,
@@ -69,6 +69,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "deepseek-v2.5",
    "deepseek-v2-chat-0628",
     "glm-edge-v",
+    "QvQ-72B-Preview",
 ]
 
 
@@ -47,6 +47,8 @@ class Qwen2VLChatModel(PytorchChatModel):
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qvq-72b-preview".lower() in llm_family.lower():
+            return True
         return False
 
     def load(self):
@@ -52,6 +52,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen2-instruct",
     "qwen2-moe-instruct",
     "qwen2.5-instruct",
+    "qwen2.5-coder-instruct",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
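Note: adding "qwen2.5-coder-instruct" to QWEN_TOOL_CALL_FAMILY enables Qwen-style tool-call handling for that family. A hedged usage sketch against xinference's OpenAI-compatible endpoint (the endpoint URL, model uid, and tool definition below are placeholders):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-needed")
    resp = client.chat.completions.create(
        model="qwen2.5-coder-instruct",  # uid of a launched model, placeholder
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }],
    )
    print(resp.choices[0].message.tool_calls)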
@@ -70,6 +70,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_model_len: Optional[int]
     limit_mm_per_prompt: Optional[Dict[str, int]]
     guided_decoding_backend: Optional[str]
+    scheduling_policy: Optional[str]
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
@@ -187,10 +188,14 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.2":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
+
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
 
 
 class VLLMModel(LLM):
@@ -244,7 +249,6 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
-
         if self.lora_modules is None:
             self.lora_requests = []
         else:
@@ -327,7 +331,9 @@ class VLLMModel(LLM):
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
         model_config.setdefault("guided_decoding_backend", "outlines")
-
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
         return model_config
 
     @staticmethod
@@ -859,6 +865,9 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 "image": 2,  # default 2 images all chat
             }
         )
+        # Add scheduling policy if vLLM version is 0.6.3 or higher
+        if vllm.__version__ >= "0.6.3":
+            model_config.setdefault("scheduling_policy", "fcfs")
 
         return model_config
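Note: both hunks above default scheduling_policy to "fcfs" when vLLM >= 0.6.3 is installed, and the VLLMModelConfig change earlier lets callers override it. A hedged launch sketch, assuming extra launch kwargs flow into the vLLM model config as the setdefault calls imply (endpoint and model parameters are placeholders):

    from xinference.client import Client

    client = Client("http://localhost:9997")
    uid = client.launch_model(
        model_name="qwen2.5-instruct",
        model_engine="vLLM",
        model_size_in_billions=7,
        model_format="pytorch",
        quantization="none",
        scheduling_policy="priority",  # vLLM >= 0.6.3 accepts "fcfs" or "priority"
    )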