xinference 1.8.1rc1__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +5 -0
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +3 -4
- xinference/deploy/local.py +5 -0
- xinference/deploy/worker.py +6 -0
- xinference/model/core.py +3 -0
- xinference/model/embedding/sentence_transformers/core.py +3 -4
- xinference/model/embedding/vllm/core.py +4 -3
- xinference/model/image/model_spec.json +69 -0
- xinference/model/image/stable_diffusion/core.py +22 -0
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llm_family.json +293 -8
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +15 -7
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +38 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +84 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +106 -0
- xinference/model/utils.py +109 -0
- xinference/types.py +2 -0
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.b969199a.js → main.4918643a.js} +3 -3
- xinference/ui/web/ui/build/static/js/{main.b969199a.js.map → main.4918643a.js.map} +1 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/METADATA +6 -1
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/RECORD +58 -50
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.4918643a.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.0.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2025-08-
+ "date": "2025-08-16T21:34:08+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "1.8.1rc1"
+ "full-revisionid": "38e0401f83799f57d42ef948c57782466b8e4777",
+ "version": "1.9.0"
 }
 ''' # END VERSION_JSON
xinference/api/restful_api.py
CHANGED
@@ -2249,8 +2249,9 @@ class RESTfulAPI(CancelMixin):
         )
         if body.tools and body.stream:
             is_vllm = await model.is_vllm_backend()
+            is_sglang = await model.is_sglang_backend()
             if not (
-                (is_vllm and model_family in QWEN_TOOL_CALL_FAMILY)
+                ((is_vllm or is_sglang) and model_family in QWEN_TOOL_CALL_FAMILY)
                 or (not is_vllm and model_family in GLM4_TOOL_CALL_FAMILY)
             ):
                 raise HTTPException(
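With the relaxed check, streaming tool-call requests are now accepted when a Qwen-family model runs on the SGLang backend, not only on vLLM. A minimal client-side sketch against the OpenAI-compatible endpoint; the server address and the model uid "qwen2.5-instruct" are placeholders, not values taken from this diff:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-used")

# A hypothetical tool definition, only for illustration.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

# tools + stream=True: previously rejected unless the backend was vLLM,
# now also allowed when the Qwen-family model is served by SGLang.
stream = client.chat.completions.create(
    model="qwen2.5-instruct",
    messages=[{"role": "user", "content": "What's the weather in Berlin?"}],
    tools=tools,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta)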
xinference/core/model.py
CHANGED
@@ -365,6 +365,11 @@ class ModelActor(xo.StatelessActor, CancelMixin):
 
         return isinstance(self._model, VLLMModel)
 
+    def is_sglang_backend(self) -> bool:
+        from ..model.llm.sglang.core import SGLANGModel
+
+        return isinstance(self._model, SGLANGModel)
+
     async def load(self):
         try:
             # Change process title for model
xinference/core/supervisor.py
CHANGED
@@ -476,7 +476,7 @@ class SupervisorActor(xo.StatelessActor):
     async def _to_rerank_model_reg(
         self, model_spec: "RerankModelFamilyV2", is_builtin: bool
     ) -> Dict[str, Any]:
-        from ..model.cache_manager import CacheManager
+        from ..model.rerank.cache_manager import RerankCacheManager as CacheManager
 
         instance_cnt = await self.get_instance_count(model_spec.model_name)
         version_cnt = await self.get_model_version_count(model_spec.model_name)
@@ -712,9 +712,8 @@ class SupervisorActor(xo.StatelessActor):
         from ..model.rerank import BUILTIN_RERANK_MODELS
         from ..model.rerank.custom import get_user_defined_reranks
 
-        for model_name, families in BUILTIN_RERANK_MODELS.items():
+        for model_name, family in BUILTIN_RERANK_MODELS.items():
             if detailed:
-                family = [x for x in families if x.model_hub == "huggingface"][0]
                 ret.append(await self._to_rerank_model_reg(family, is_builtin=True))
             else:
                 ret.append({"model_name": model_name, "is_builtin": True})
xinference/core/worker.py
CHANGED
@@ -817,10 +817,7 @@ class WorkerActor(xo.StatelessActor):
         # we specify python_path explicitly
         # sometimes uv would find other versions.
         python_path = pathlib.Path(sys.executable)
-
-        if XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED:
-            kw["skip_installed"] = XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED
-        virtual_env_manager.create_env(python_path=python_path, **kw)
+        virtual_env_manager.create_env(python_path=python_path)
         return virtual_env_manager
 
     @classmethod
@@ -847,6 +844,8 @@ class WorkerActor(xo.StatelessActor):
         packages.extend(virtual_env_packages)
         conf.pop("packages", None)
         conf.pop("inherit_pip_config", None)
+        if XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED:
+            conf["skip_installed"] = XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED
 
         logger.info(
             "Installing packages %s in virtual env %s, with settings(%s)",
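The skip-installed switch now travels with the package-install settings instead of the env-creation kwargs, so packages already present in the base environment are skipped at install time rather than at env creation. A hedged sketch of enabling it, assuming (this is an assumption, based only on the constant's name) that XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED is populated from an environment variable of the same name:

import os

# Assumption: the constant is read from this environment variable at startup,
# so it has to be set in the worker's environment before the worker launches.
os.environ["XINFERENCE_VIRTUAL_ENV_SKIP_INSTALLED"] = "1"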
xinference/deploy/local.py
CHANGED
@@ -152,6 +152,11 @@ def main(
     logging_conf: Optional[Dict] = None,
     auth_config_file: Optional[str] = None,
 ):
+    # force to set spawn,
+    # cuda may be inited in xoscar virtualenv
+    # which will raise error after sub pool is created
+    multiprocessing.set_start_method("spawn")
+
     supervisor_address = f"{host}:{get_next_port()}"
     local_cluster = run_in_subprocess(
         supervisor_address, metrics_exporter_host, metrics_exporter_port, logging_conf
xinference/deploy/worker.py
CHANGED
@@ -14,6 +14,7 @@
 
 import asyncio
 import logging
+import multiprocessing
 import os
 from typing import Any, Optional
 
@@ -81,6 +82,11 @@ def main(
     metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[dict] = None,
 ):
+    # force to set spawn,
+    # cuda may be inited in xoscar virtualenv
+    # which will raise error after sub pool is created
+    multiprocessing.set_start_method("spawn")
+
     loop = asyncio.get_event_loop()
     task = loop.create_task(
         _start_worker(
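Both entry points now pin the multiprocessing start method to "spawn" before any sub pools exist, since a forked child could inherit a CUDA context already initialized via the xoscar virtualenv path. For context, multiprocessing.set_start_method raises RuntimeError if a start method was already chosen for the process; a defensive variant of the same idea (a sketch, not what the diff itself does) looks like:

import multiprocessing


def ensure_spawn_start_method() -> None:
    """Force 'spawn' so child processes never inherit a live CUDA context."""
    try:
        multiprocessing.set_start_method("spawn")
    except RuntimeError:
        # The start method can only be set once per process; if something chose it
        # earlier, make sure it really is 'spawn' instead of silently keeping 'fork'.
        if multiprocessing.get_start_method() != "spawn":
            raise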
xinference/model/core.py
CHANGED

xinference/model/embedding/sentence_transformers/core.py
CHANGED
@@ -19,8 +19,8 @@ from typing import List, Optional, Union, no_type_check
 import numpy as np
 import torch
 
-from ....device_utils import is_device_available
 from ....types import Embedding, EmbeddingData, EmbeddingUsage
+from ...utils import is_flash_attn_available
 from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
 
 logger = logging.getLogger(__name__)
@@ -85,13 +85,12 @@ class SentenceTransformerEmbeddingModel(EmbeddingModel):
             )
         elif "qwen3" in self.model_family.model_name.lower():
             # qwen3 embedding
-            flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
             flash_attn_enabled = self._kwargs.get(
-                "enable_flash_attn",
+                "enable_flash_attn", is_flash_attn_available()
             )
             model_kwargs = {"device_map": "auto"}
             tokenizer_kwargs = {}
-            if
+            if flash_attn_enabled:
                 model_kwargs["attn_implementation"] = "flash_attention_2"
                 model_kwargs["torch_dtype"] = "bfloat16"
                 tokenizer_kwargs["padding_side"] = "left"
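The inline importlib probe for flash_attn is replaced by a shared helper imported from xinference.model.utils (a file also extended in this release). A minimal sketch of what such a helper amounts to, based on the check the removed inline code performed; the real helper may do more, for example verifying that a compatible GPU is present:

import importlib.util


def is_flash_attn_available() -> bool:
    # The removed inline check only asked whether the `flash_attn` package is importable.
    return importlib.util.find_spec("flash_attn") is not None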
xinference/model/embedding/vllm/core.py
CHANGED
@@ -17,6 +17,7 @@ import logging
 from typing import List, Union
 
 from ....types import Embedding, EmbeddingData, EmbeddingUsage
+from ...utils import cache_clean
 from ..core import EmbeddingModel, EmbeddingModelFamilyV2, EmbeddingSpecV1
 
 logger = logging.getLogger(__name__)
@@ -42,13 +43,14 @@ class VLLMEmbeddingModel(EmbeddingModel):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-        self._model = LLM(model=self._model_path, task="embed")
+        self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
 
     @staticmethod
     def _get_detailed_instruct(task_description: str, query: str) -> str:
         return f"Instruct: {task_description}\nQuery:{query}"
 
+    @cache_clean
     def create_embedding(
         self,
         sentences: Union[str, List[str]],
@@ -60,8 +62,7 @@ class VLLMEmbeddingModel(EmbeddingModel):
         normalize_embedding = kwargs.get("normalize_embedding", True)
         if not normalize_embedding:
             raise ValueError(
-                "vllm embedding engine does not support "
-                "setting `normalize_embedding=False`"
+                "vllm embedding engine does not support setting `normalize_embedding=False`"
             )
 
         assert self._model is not None
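Forwarding **self._kwargs means engine options supplied when the embedding model is launched now reach the vLLM constructor directly instead of being dropped. A hedged sketch of the resulting call; the model path is a placeholder and gpu_memory_utilization stands in for any forwarded option:

from vllm import LLM

# Roughly what the embedding model constructs internally once extra kwargs
# are supplied at launch time (path and value are placeholders).
llm = LLM(
    model="/path/to/embedding-model",
    task="embed",
    gpu_memory_utilization=0.5,
)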
xinference/model/image/model_spec.json
CHANGED
@@ -175,6 +175,75 @@
       "no_build_isolation": true
     }
   },
+  {
+    "version": 2,
+    "model_name": "Qwen-Image",
+    "model_family": "stable_diffusion",
+    "model_ability": [
+      "text2image"
+    ],
+    "model_src": {
+      "huggingface": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "4516c4d3058302ff35cd86c62ffa645d039fefad",
+        "gguf_model_id": "city96/Qwen-Image-gguf",
+        "gguf_quantizations": [
+          "F16",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+      },
+      "modelscope": {
+        "model_id": "Qwen/Qwen-Image",
+        "model_revision": "master",
+        "gguf_model_id": "city96/Qwen-Image-gguf",
+        "gguf_quantizations": [
+          "F16",
+          "Q3_K_M",
+          "Q3_K_S",
+          "Q4_0",
+          "Q4_1",
+          "Q4_K_M",
+          "Q4_K_S",
+          "Q5_0",
+          "Q5_1",
+          "Q5_K_M",
+          "Q5_K_S",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "gguf_model_file_name_template": "qwen-image-{quantization}.gguf"
+      }
+    },
+    "default_model_config": {
+      "quantize": true,
+      "quantize_text_encoder": "text_encoder",
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {
+      "guidance_scale": 1.0
+    },
+    "virtualenv": {
+      "packages": [
+        "git+https://github.com/huggingface/diffusers.git",
+        "peft>=0.17.0",
+        "#system_torch#",
+        "#system_numpy#"
+      ],
+      "no_build_isolation": true
+    }
+  },
   {
     "version": 2,
     "model_name": "sd3-medium",
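This entry registers Qwen-Image as a built-in text-to-image model, with GGUF-quantized transformer weights available from city96/Qwen-Image-gguf. A hedged usage sketch with the Python client, assuming a server running at localhost:9997; the exact kwargs for selecting a specific GGUF quantization are not shown here:

from xinference.client import Client

client = Client("http://localhost:9997")
# Launch the newly registered image model and generate one image.
model_uid = client.launch_model(model_name="Qwen-Image", model_type="image")
model = client.get_model(model_uid)
result = model.text_to_image("a watercolor painting of a lighthouse at dawn")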
xinference/model/image/stable_diffusion/core.py
CHANGED
@@ -254,6 +254,14 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model = FluxKontextPipeline.from_pretrained(
                 self._model_path, **self._kwargs
             )
+        elif "qwen" in self._model_spec.model_name.lower():
+            # TODO: remove this branch when auto pipeline supports
+            # Qwen-Image
+            from diffusers import DiffusionPipeline
+
+            self._model = DiffusionPipeline.from_pretrained(
+                self._model_path, **self._kwargs
+            )
         else:
             raise
         self._load_to_device(self._model)
@@ -348,11 +356,19 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             return
 
         if not quantize_text_encoder:
+            logger.debug("No text encoder quantization")
             return
 
         quantization_method = self._kwargs.pop("text_encoder_quantize_method", "bnb")
         quantization = self._kwargs.pop("text_encoder_quantization", "8-bit")
 
+        logger.debug(
+            "Quantize text encoder %s with method %s, quantization %s",
+            quantize_text_encoder,
+            quantization_method,
+            quantization,
+        )
+
         torch_dtype = self._torch_dtype
         for text_encoder_name in quantize_text_encoder.split(","):
             quantization_kwargs: Dict[str, Any] = {}
@@ -389,8 +405,13 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
 
         if not quantization:
             # skip if no quantization specified
+            logger.debug("No transformer quantization")
             return
 
+        logger.debug(
+            "Quantize transformer with %s, quantization %s", method, quantization
+        )
+
         torch_dtype = self._torch_dtype
         transformer_cls = self._get_layer_cls("transformer")
         quantization_config = self._get_quantize_config(
@@ -409,6 +430,7 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
 
         # GGUF transformer
         torch_dtype = self._torch_dtype
+        logger.debug("Quantize transformer with gguf file %s", self._gguf_model_path)
         self._kwargs["transformer"] = self._get_layer_cls(
             "transformer"
         ).from_single_file(
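The new branch loads Qwen-Image through diffusers' generic DiffusionPipeline because the auto-pipeline path used for the other families does not map it yet. Outside of Xinference the equivalent load looks roughly like this (a sketch; it assumes a diffusers build recent enough to ship the Qwen-Image pipeline, which is why the spec above installs diffusers from git):

import torch
from diffusers import DiffusionPipeline

# Generic pipeline load, mirroring the fallback branch above.
pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
pipe.to("cuda")
image = pipe(prompt="a cat wearing a spacesuit", guidance_scale=1.0).images[0]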
xinference/model/llm/cache_manager.py
CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import logging
 import os
 from typing import TYPE_CHECKING, Optional
@@ -81,7 +95,7 @@ class LLMCacheManager(CacheManager):
         if not IS_NEW_HUGGINGFACE_HUB:
             use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 huggingface_hub.snapshot_download,
                 self._model_name,
@@ -144,7 +158,7 @@ class LLMCacheManager(CacheManager):
         if self.get_cache_status():
             return cache_dir
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "bnb", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 snapshot_download,
                 self._model_name,
@@ -234,7 +248,7 @@ class LLMCacheManager(CacheManager):
         if self.get_cache_status():
             return cache_dir
 
-        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "mlx"]:
+        if self._model_format in ["pytorch", "gptq", "awq", "fp8", "bnb", "mlx"]:
             download_dir = retry_download(
                 snapshot_download,
                 self._model_name,
xinference/model/llm/harmony.py
ADDED
@@ -0,0 +1,245 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, Union
+
+if TYPE_CHECKING:
+    from ...types import ChatCompletion, ChatCompletionChunk
+
+
+class HarmonyStreamParser:
+    def __init__(self):
+        # Current channel: either 'analysis', 'final', or None if not started yet
+        self.current_channel = None
+        # Buffer for accumulating text when looking for 'assistantfinal' marker
+        self.buffer = ""
+
+    def feed(self, text):
+        """
+        Feed a chunk of text into the parser and return parsed segments.
+
+        Each segment is a dict:
+        {
+            "channel": "analysis" | "final",
+            "content": <string>
+        }
+
+        The parser detects 'assistantfinal' markers inside reasoning text,
+        splits the reasoning and final content correctly, and switches the channel.
+        """
+        segments = []
+
+        # If we are currently in 'analysis' mode
+        if self.current_channel == "analysis":
+            # Add text to buffer and check for 'assistantfinal' marker
+            self.buffer += text
+            if "assistantfinal" in self.buffer:
+                # Split reasoning and final content
+                before, after = self.buffer.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                # Switch to final channel
+                self.current_channel = "final"
+                self.buffer = ""
+                if after:
+                    segments.append({"channel": "final", "content": after})
+                return segments
+            else:
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    return segments
+                else:
+                    # Emit what we have so far and keep buffer for next time
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+                    return segments
+
+        # If we are currently in 'final' mode
+        if self.current_channel == "final":
+            # Check if this is actually a new message starting with 'analysis'
+            if text.startswith("analysis"):
+                # Reset parser state for new message
+                self.current_channel = None
+                self.buffer = ""
+                # Re-process this text with the new state
+                return self.feed(text)
+            else:
+                segments.append({"channel": "final", "content": text})
+                return segments
+
+        # If no channel has been started yet
+        if text.startswith("analysis"):
+            self.current_channel = "analysis"
+            rest = text[len("analysis") :]
+            if "assistantfinal" in rest:
+                # Split immediately if marker is found in the first chunk
+                before, after = rest.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                self.current_channel = "final"
+                if after:
+                    segments.append({"channel": "final", "content": after})
+            else:
+                # Start buffering for potential 'assistantfinal' marker
+                self.buffer = rest
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    pass
+                else:
+                    # Emit what we have so far
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+        elif text.startswith("assistantfinal"):
+            self.current_channel = "final"
+            rest = text[len("assistantfinal") :]
+            if rest:
+                segments.append({"channel": "final", "content": rest})
+
+        return segments
+
+
+async def async_stream_harmony_chat_completion(
+    chunks: Union[
+        "ChatCompletion",
+        AsyncGenerator["ChatCompletionChunk", None],
+    ],
+) -> AsyncGenerator["ChatCompletion", None]:
+    """
+    Parse Harmony-formatted content from either a full ChatCompletion (non-streaming)
+    or an async stream of ChatCompletionChunk (streaming), using the HarmonyStreamParser defined in this file.
+
+    Yields parsed objects incrementally.
+    """
+
+    # --- Non-streaming: ChatCompletion ---
+    if isinstance(chunks, dict) and chunks.get("object") == "chat.completion":
+        out_data = deepcopy(chunks)
+
+        for choice in out_data["choices"]:
+            parser = HarmonyStreamParser()
+            msg = choice["message"]
+
+            # Backup original content & reasoning
+            original_content = msg.get("content") or ""
+            original_reasoning = msg.get("reasoning_content") or ""
+
+            # Reset fields before parsing
+            msg["content"] = ""
+            msg["reasoning_content"] = ""
+            msg.setdefault("tool_calls", [])
+
+            # Feed original content
+            for seg in parser.feed(original_content):
+                ch, c = seg["channel"], seg["content"]
+                if ch == "final":
+                    msg["content"] += c
+                elif ch == "analysis":
+                    msg["reasoning_content"] += c
+                elif ch == "tool":
+                    msg["tool_calls"].append(c)
+
+            # Feed original reasoning_content
+            for seg in parser.feed(original_reasoning):
+                if seg["channel"] == "analysis":
+                    msg["reasoning_content"] += seg["content"]
+                elif seg["channel"] == "tool":
+                    msg["tool_calls"].append(seg["content"])
+
+            # Clean up reasoning_content: set to None if no reasoning content was parsed
+            if not msg["reasoning_content"] and not original_reasoning:
+                msg["reasoning_content"] = None  # type: ignore
+
+        yield out_data
+
+    else:
+        # Streaming: handle async generator
+        parsers_per_choice = {}
+
+        async for chunk in chunks:  # type: ignore
+            out_chunk = {  # type: ignore
+                "id": chunk["id"],
+                "model": chunk["model"],
+                "object": chunk["object"],
+                "created": chunk["created"],
+                "choices": [],
+            }
+
+            for i, choice in enumerate(chunk["choices"]):
+                delta = choice.get("delta", {})
+                text = delta.get("content") or ""  # type: ignore
+
+                if i not in parsers_per_choice:
+                    parsers_per_choice[i] = HarmonyStreamParser()
+
+                # Feed text to parser and collect current delta only
+                curr_delta: Dict[str, object] = {
+                    "content": "",
+                    "reasoning_content": "",
+                    "tool_calls": [],
+                }
+
+                for seg in parsers_per_choice[i].feed(text):
+                    ch = seg["channel"]
+                    c = seg["content"]
+                    if ch == "final":
+                        curr_delta["content"] += c  # type: ignore
+                    elif ch == "analysis":
+                        curr_delta["reasoning_content"] += c  # type: ignore
+                    elif ch == "tool":
+                        curr_delta["tool_calls"].append(c)  # type: ignore
+
+                if curr_delta["reasoning_content"]:
+                    if not curr_delta["content"]:
+                        curr_delta["content"] = None
+
+                elif curr_delta["content"]:
+                    if not curr_delta["reasoning_content"]:
+                        curr_delta["reasoning_content"] = None
+
+                elif (
+                    choice.get("finish_reason") is not None
+                    and not curr_delta["reasoning_content"]
+                ):
+                    # For the final chunk, if there's no new reasoning content,
+                    # don't include empty reasoning_content to avoid clearing existing state
+                    curr_delta["reasoning_content"] = None
+
+                out_chunk["choices"].append(  # type: ignore
+                    {
+                        "index": i,
+                        "delta": curr_delta,
+                        "finish_reason": choice.get("finish_reason"),
+                    }
+                )
+
+            # Only yield if we have either content or reasoning_content
+            has_content = any(
+                choice["delta"].get("content")  # type: ignore
+                or choice["delta"].get("reasoning_content")  # type: ignore
+                or choice.get("finish_reason") is not None  # type: ignore
+                for choice in out_chunk["choices"]  # type: ignore
+            )
+            if has_content:
+                yield out_chunk  # type: ignore