xinference 1.7.0__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic; see the package page for details.

Files changed (83)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +3 -4
  3. xinference/client/__init__.py +2 -0
  4. xinference/client/common.py +49 -2
  5. xinference/client/handlers.py +18 -0
  6. xinference/client/restful/async_restful_client.py +1760 -0
  7. xinference/client/restful/restful_client.py +74 -78
  8. xinference/core/media_interface.py +3 -1
  9. xinference/core/model.py +5 -4
  10. xinference/core/supervisor.py +10 -5
  11. xinference/core/worker.py +15 -14
  12. xinference/deploy/local.py +51 -9
  13. xinference/deploy/worker.py +5 -3
  14. xinference/device_utils.py +22 -3
  15. xinference/model/audio/fish_speech.py +23 -34
  16. xinference/model/audio/model_spec.json +4 -2
  17. xinference/model/audio/model_spec_modelscope.json +4 -2
  18. xinference/model/audio/utils.py +2 -2
  19. xinference/model/core.py +1 -0
  20. xinference/model/embedding/__init__.py +8 -8
  21. xinference/model/embedding/custom.py +6 -1
  22. xinference/model/embedding/embed_family.py +0 -41
  23. xinference/model/embedding/model_spec.json +10 -1
  24. xinference/model/embedding/model_spec_modelscope.json +10 -1
  25. xinference/model/embedding/sentence_transformers/core.py +30 -15
  26. xinference/model/flexible/core.py +1 -1
  27. xinference/model/flexible/launchers/__init__.py +2 -0
  28. xinference/model/flexible/launchers/image_process_launcher.py +1 -1
  29. xinference/model/flexible/launchers/modelscope_launcher.py +47 -0
  30. xinference/model/flexible/launchers/transformers_launcher.py +5 -5
  31. xinference/model/flexible/launchers/yolo_launcher.py +62 -0
  32. xinference/model/llm/__init__.py +7 -0
  33. xinference/model/llm/core.py +18 -1
  34. xinference/model/llm/llama_cpp/core.py +1 -1
  35. xinference/model/llm/llm_family.json +43 -3
  36. xinference/model/llm/llm_family.py +6 -0
  37. xinference/model/llm/llm_family_modelscope.json +45 -3
  38. xinference/model/llm/mlx/core.py +271 -18
  39. xinference/model/llm/mlx/distributed_models/__init__.py +13 -0
  40. xinference/model/llm/mlx/distributed_models/core.py +164 -0
  41. xinference/model/llm/mlx/distributed_models/deepseek_v3.py +75 -0
  42. xinference/model/llm/mlx/distributed_models/qwen2.py +82 -0
  43. xinference/model/llm/mlx/distributed_models/qwen3.py +82 -0
  44. xinference/model/llm/mlx/distributed_models/qwen3_moe.py +76 -0
  45. xinference/model/llm/reasoning_parser.py +12 -6
  46. xinference/model/llm/sglang/core.py +8 -4
  47. xinference/model/llm/transformers/chatglm.py +4 -1
  48. xinference/model/llm/transformers/core.py +4 -2
  49. xinference/model/llm/transformers/multimodal/cogagent.py +10 -4
  50. xinference/model/llm/transformers/multimodal/intern_vl.py +1 -1
  51. xinference/model/llm/utils.py +36 -17
  52. xinference/model/llm/vllm/core.py +142 -34
  53. xinference/model/llm/vllm/distributed_executor.py +96 -21
  54. xinference/model/llm/vllm/xavier/transfer.py +2 -2
  55. xinference/model/rerank/core.py +26 -9
  56. xinference/model/rerank/model_spec.json +3 -3
  57. xinference/model/rerank/model_spec_modelscope.json +3 -3
  58. xinference/web/ui/build/asset-manifest.json +3 -3
  59. xinference/web/ui/build/index.html +1 -1
  60. xinference/web/ui/build/static/js/main.9b12b7f9.js +3 -0
  61. xinference/web/ui/build/static/js/main.9b12b7f9.js.map +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/0fd4820d93f99509e80d8702dc3f6f8272424acab5608fa7c0e82cb1d3250a87.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/f75545479c17fdfe2a00235fa4a0e9da1ae95e6b3caafba87ded92de6b0240e4.json +1 -0
  67. xinference/web/ui/src/locales/en.json +3 -0
  68. xinference/web/ui/src/locales/ja.json +3 -0
  69. xinference/web/ui/src/locales/ko.json +3 -0
  70. xinference/web/ui/src/locales/zh.json +3 -0
  71. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/METADATA +4 -3
  72. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/RECORD +77 -67
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +0 -3
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/26b8c9f34b0bed789b3a833767672e39302d1e0c09b4276f4d58d1df7b6bd93b.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/c5c7c2cd1b863ce41adff2c4737bba06eef3a1acf28288cb83d992060f6b8923.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/cc97b49285d7717c63374766c789141a4329a04582ab32756d7e0e614d4c5c7f.json +0 -1
  79. /xinference/web/ui/build/static/js/{main.8a9e3ba0.js.LICENSE.txt → main.9b12b7f9.js.LICENSE.txt} +0 -0
  80. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/WHEEL +0 -0
  81. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/entry_points.txt +0 -0
  82. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/licenses/LICENSE +0 -0
  83. {xinference-1.7.0.dist-info → xinference-1.7.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/qwen2.py (new file)
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen2 import Model as _Model
+from mlx_lm.models.qwen2 import ModelArgs
+from mlx_lm.models.qwen2 import Qwen2Model as _Qwen2Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2Model(_Qwen2Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen2Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen2Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
xinference/model/llm/mlx/distributed_models/qwen3.py (new file)
@@ -0,0 +1,82 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Optional
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3 import Model as _Model
+from mlx_lm.models.qwen3 import ModelArgs
+from mlx_lm.models.qwen3 import Qwen3Model as _Qwen3Model
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3Model(_Qwen3Model, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3Model.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        x: mx.array,
+        mask: Optional[mx.array] = None,
+        cache: Optional[Any] = None,
+        input_embeddings: Optional[mx.array] = None,
+    ) -> mx.array:
+        if input_embeddings is not None:
+            h = input_embeddings
+        else:
+            h = self.embed_tokens(x)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3Model(args)
+        if not args.tie_word_embeddings:
+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
xinference/model/llm/mlx/distributed_models/qwen3_moe.py (new file)
@@ -0,0 +1,76 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.qwen3_moe import Model as _Model
+from mlx_lm.models.qwen3_moe import ModelArgs
+from mlx_lm.models.qwen3_moe import Qwen3MoeModel as _Qwen3MoeModel
+
+from .core import DistributedModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3MoeModel(_Qwen3MoeModel, DistributedModelMixin):
+    def __init__(self, *args, **kwargs):
+        _Qwen3MoeModel.__init__(self, *args, **kwargs)
+        DistributedModelMixin.__init__(self)
+
+    def __call__(
+        self,
+        inputs: mx.array,
+        mask: mx.array = None,
+        cache=None,
+    ):
+        h = self.embed_tokens(inputs)
+
+        pipeline_rank = self.rank
+        pipeline_size = self.world_size
+        if mask is None:
+            mask = create_attention_mask(h, cache)
+
+        if cache is None:
+            cache = [None] * self.num_layers
+
+        # Receive from the previous process in the pipeline
+
+        if pipeline_rank < pipeline_size - 1:
+            # wait for previous result
+            h = self._wait_prev_stage_result()
+
+        for i in range(self.num_layers):
+            h = self.layers[self.start_idx + i](h, mask, cache[i])
+        mx.eval(h)
+
+        # Send to the next process in the pipeline
+        if pipeline_rank != 0:
+            self._send_stage_result(h)
+            h = self._get_result()
+        else:
+            self._broadcast_result(h)
+
+        return self.norm(h)
+
+
+class Model(_Model):
+    def __init__(self, args: ModelArgs):
+        nn.Module.__init__(self)
+        self.args = args
+        self.model_type = args.model_type
+        self.model = Qwen3MoeModel(args)
+        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
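
All three new distributed_models modules follow the same pipeline-parallel scheme: each rank runs only its own slice of decoder layers (self.start_idx through self.start_idx + self.num_layers), ranks below world_size - 1 first block on the hidden states from the preceding stage, ranks other than 0 then send their output along and wait for the final result, and rank 0 broadcasts the final hidden states so every rank returns the same tensor. The mixin they rely on lives in distributed_models/core.py (+164 lines, not included in this excerpt); the sketch below is only a hypothetical reconstruction of the interface these call sites imply, not the actual implementation.

```python
# Hypothetical sketch of the DistributedModelMixin contract implied by the
# call sites above; the real implementation is in distributed_models/core.py.
from abc import ABC, abstractmethod


class DistributedModelMixin(ABC):
    def __init__(self) -> None:
        # Filled in by the launcher when the model is sharded across workers.
        self.rank: int = 0          # this worker's pipeline rank
        self.world_size: int = 1    # number of pipeline stages
        self.start_idx: int = 0     # first decoder layer owned by this rank
        self.num_layers: int = 0    # number of decoder layers owned by this rank

    @abstractmethod
    def _wait_prev_stage_result(self):
        """Block until the preceding stage delivers its hidden states."""

    @abstractmethod
    def _send_stage_result(self, h):
        """Ship this stage's hidden states to the next stage."""

    @abstractmethod
    def _get_result(self):
        """Block until rank 0 broadcasts the final hidden states."""

    @abstractmethod
    def _broadcast_result(self, h):
        """Rank 0 only: broadcast the final hidden states to all ranks."""
```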
xinference/model/llm/reasoning_parser.py
@@ -222,6 +222,12 @@ class ReasoningParser:
             ],
         )
 
+    def is_enable_thinking(self):
+        from .core import chat_context_var
+
+        context = chat_context_var.get({})
+        return context.get("enable_thinking", self.enable_thinking)
+
     async def prepare_reasoning_content_streaming(
         self, chunks: AsyncGenerator[CompletionChunk, None]
     ):
@@ -237,7 +243,7 @@ class ReasoningParser:
 
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
         # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
             async for chunk in chunks:
                 yield chunk
             return
@@ -266,7 +272,7 @@ class ReasoningParser:
                     continue
                 assert isinstance(delta, dict)
                 text = delta.get("content")
-                if text is None:
+                if not text:
                     continue
                 # If the first chunk doesn't contain the reasoning_start_tag
                 if self.reasoning_start_tag not in text:
@@ -277,7 +283,7 @@ class ReasoningParser:
             else:
                 # For standard completion chunks
                 text = choices[0].get("text")
-                if text is None:
+                if not text:
                     continue
                 # If the first chunk doesn't contain the reasoning_start_tag
                 if self.reasoning_start_tag not in text:
@@ -304,7 +310,7 @@ class ReasoningParser:
         """
         # If reasoning_start_tag is not set, or disable thinking for hybrid model like qwen3,
         # yield chunks as is
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            for chunk in chunks:
                yield chunk
            return
@@ -365,7 +371,7 @@ class ReasoningParser:
            completion: The completion object containing model output,
                which can be either a chat completion or a standard completion.
        """
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return completion
 
        if completion.get("object") == "chat.completion" and completion.get("choices"):
@@ -399,7 +405,7 @@ class ReasoningParser:
                or an empty list if no modification is needed
        """
        chunks: List[ChatCompletionChunk] = []
-        if not self.reasoning_start_tag or not self.enable_thinking:
+        if not self.reasoning_start_tag or not self.is_enable_thinking():
            return chunks
 
        choices = chunk.get("choices")
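
Throughout this release the per-request chat_template_kwargs are stashed in chat_context_var before the prompt is rendered (see the sglang, chatglm, transformers and cogagent hunks below), and the reasoning parser reads them back through the new is_enable_thinking(), falling back to its constructor-level default. chat_context_var itself is defined in xinference/model/llm/core.py (+18 -1 in this diff, not shown here); assuming it is a plain contextvars.ContextVar, the round trip looks roughly like this sketch (render_chat is a made-up stand-in for the various chat entry points):

```python
# Hedged sketch of the per-request override pattern; chat_context_var is
# presumably a contextvars.ContextVar defined in xinference/model/llm/core.py.
from contextvars import ContextVar
from typing import Any, Dict

chat_context_var: ContextVar[Dict[str, Any]] = ContextVar("chat_context")


def render_chat(generate_config: Dict[str, Any]) -> None:
    # The chat entry points stash the request's chat_template_kwargs...
    chat_template_kwargs = generate_config.get("chat_template_kwargs", {})
    chat_context_var.set(chat_template_kwargs)


def is_enable_thinking(default: bool = True) -> bool:
    # ...and the reasoning parser reads the override back, falling back to
    # the parser-level default when the request did not specify one.
    context = chat_context_var.get({})
    return context.get("enable_thinking", default)


render_chat({"chat_template_kwargs": {"enable_thinking": False}})
assert is_enable_thinking() is False
```

The practical effect is that a hybrid model launched with thinking enabled or disabled can be overridden per request via chat_template_kwargs instead of only at launch time.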
xinference/model/llm/sglang/core.py
@@ -33,6 +33,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..core import chat_context_var
 from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin, generate_completion_chunk
 
@@ -582,16 +583,17 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )
-
         generate_config = self._sanitize_chat_config(generate_config)
         stream = generate_config.get("stream", None)
         if stream:
@@ -656,14 +658,16 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         chat_template: str = (
             self.model_family.chat_template if self.model_family.chat_template else ""
         )
-
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         prompt = self.get_full_context(messages, chat_template, **full_context_kwargs)
+
         images, video_inputs = process_vision_info(messages)
         if video_inputs:
             raise ValueError("Not support video input now.")
xinference/model/llm/transformers/chatglm.py
@@ -22,6 +22,7 @@ import torch
 
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
+from ..core import chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
@@ -464,12 +465,14 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             tools = list(tools) if tools is not None else None
             tool_choice = r.generate_config.get("tool_choice", "none")
 
-            full_context_kwargs = (
+            chat_template_kwargs = (
                 self._get_chat_template_kwargs_from_generate_config(
                     r.generate_config, self.reasoning_parser
                 )
                 or {}
            )
+            chat_context_var.set(chat_template_kwargs)
+            full_context_kwargs = chat_template_kwargs.copy()
             r.prompt = self._process_messages(
                 r.prompt, tools=tools, tool_choice=tool_choice
             )
xinference/model/llm/transformers/core.py
@@ -37,7 +37,7 @@ from ....types import (
     PytorchModelConfig,
 )
 from ...utils import select_device
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
@@ -725,12 +725,14 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
         model_family = self.model_family.model_family or self.model_family.model_name
-        full_context_kwargs = (
+        chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser
             )
             or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs = chat_template_kwargs.copy()
         if (
             tools
             and model_family in QWEN_TOOL_CALL_FAMILY
xinference/model/llm/transformers/multimodal/cogagent.py
@@ -20,6 +20,7 @@ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union
 import torch
 
 from .....model.utils import select_device
+from ...core import chat_context_var
 from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
@@ -33,8 +34,8 @@ logger = logging.getLogger(__name__)
 class CogAgentChatModel(PytorchMultiModalModel):
     def __init__(self, *args, **kws):
         super().__init__(*args, **kws)
-        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"
-        self._format: Optional[
+        self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"  # type: ignore
+        self._format: Optional[  # type: ignore
             Literal[
                 "(Answer in Action-Operation-Sensitive format.)",
                 "(Answer in Status-Plan-Action-Operation format.)",
@@ -187,9 +188,14 @@ class CogAgentChatModel(PytorchMultiModalModel):
             "return_tensors": "pt",
             "return_dict": True,
         }
-        full_context_kwargs.update(
-            self._get_chat_template_kwargs_from_generate_config(generate_config, self.reasoning_parser) or {}  # type: ignore
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
         )
+        chat_context_var.set(chat_template_kwargs)
+        full_context_kwargs.update(chat_template_kwargs)
         assert self.model_family.chat_template is not None
         inputs = self.get_full_context(
             [{"role": "user", "image": image, "content": query}],
xinference/model/llm/transformers/multimodal/intern_vl.py
@@ -83,7 +83,7 @@ class InternVLChatModel(PytorchMultiModalModel):
     def load_multimodal_model(self):
         from transformers import AutoModel
 
-        kwargs: Dict[str, Any] = {
+        kwargs: Dict[str, Any] = {  # type: ignore
             "torch_dtype": torch.bfloat16,
             "low_cpu_mem_usage": True,
             "trust_remote_code": True,
xinference/model/llm/utils.py
@@ -167,13 +167,7 @@ class ChatModelMixin:
         generate_config: Optional[Union[dict, Any]],
         reasoning_parser: Optional[ReasoningParser] = None,
     ) -> Optional[dict]:
-        if reasoning_parser and not reasoning_parser.enable_thinking:
-            # hybrid model like qwen3,
-            # disabled thinking
-            return {"enable_thinking": False}
-        if not generate_config:
-            return None
-        if "chat_template_kwargs" in generate_config:
+        if generate_config and "chat_template_kwargs" in generate_config:
             kwargs = generate_config["chat_template_kwargs"]
             if isinstance(kwargs, str):
                 try:
@@ -190,6 +184,10 @@ class ChatModelMixin:
                         f"`chat_template_kwargs` but be a JSON parsable str "
                         f"or dict, got: {kwargs}"
                     )
+        elif reasoning_parser and not reasoning_parser.enable_thinking:
+            # hybrid model like qwen3,
+            # disabled thinking
+            return {"enable_thinking": False}
         return None
 
     @staticmethod
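
The reordering above changes precedence: in 1.7.0 a reasoning parser with thinking disabled short-circuited and always returned {"enable_thinking": False}, discarding any chat_template_kwargs in the request; in 1.7.1 explicit chat_template_kwargs win and the parser default only applies when the request provides none. A minimal stand-alone sketch of the new ordering, using a stand-in parser object rather than the real ReasoningParser:

```python
# Sketch of the 1.7.1 precedence in _get_chat_template_kwargs_from_generate_config;
# _Parser is a made-up stand-in for a ReasoningParser launched with thinking off.
import json
from typing import Any, Optional


def get_chat_template_kwargs(
    generate_config: Optional[dict], reasoning_parser: Optional[Any] = None
) -> Optional[dict]:
    # Explicit per-request kwargs win...
    if generate_config and "chat_template_kwargs" in generate_config:
        kwargs = generate_config["chat_template_kwargs"]
        return json.loads(kwargs) if isinstance(kwargs, str) else kwargs
    # ...and the parser-level "thinking disabled" default is only a fallback.
    elif reasoning_parser and not reasoning_parser.enable_thinking:
        return {"enable_thinking": False}
    return None


class _Parser:
    enable_thinking = False  # hybrid model launched with thinking disabled


# 1.7.0 returned {"enable_thinking": False} here regardless of the request;
# 1.7.1 lets the request re-enable thinking.
print(get_chat_template_kwargs({"chat_template_kwargs": {"enable_thinking": True}}, _Parser()))
# -> {'enable_thinking': True}
print(get_chat_template_kwargs(None, _Parser()))
# -> {'enable_thinking': False}
```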
@@ -220,7 +218,7 @@ class ChatModelMixin:
         _messages = [x for x in messages]  # copy for not modifying the origin messages
         _messages.append({"role": "assistant", "content": ""})
 
-        if model_family == "internvl2":
+        if "internvl" in model_family.lower():
             system_prompt = (
                 messages[0]["content"] if messages[0]["role"] == "system" else ""
             )
@@ -558,14 +556,24 @@ class ChatModelMixin:
     @classmethod
     def _handle_qwen_tool_result(cls, text: str) -> List[Tuple]:
         text: str = text.strip()  # type: ignore
-        contents: List[str] = text.split(QWEN_TOOL_CALL_SYMBOLS[1])
+
+        def split_into_blocks(text: str) -> list[str]:
+            # Match blocks starting with <think> or <tool_call> and ending with </think> or </tool_call>
+            pattern = r"(<(think|tool_call)>.*?</\2>)"
+            blocks = re.findall(pattern, text, re.DOTALL)
+            return [match[0] for match in blocks]
+
+        contents = split_into_blocks(text)
         results: List[Tuple] = []
         for content in contents:
             content = content.strip()
             if content:
-                pos = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
-                if pos != -1:
-                    content = content[pos + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos1 = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos1 != -1:
+                    content = content[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
+                if pos2 != -1:
+                    content = content[:pos2]
                 content = content.strip()
                 try:
                     res = json.loads(content)
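
Instead of splitting on the closing tool-call symbol, _handle_qwen_tool_result now extracts complete <think>…</think> and <tool_call>…</tool_call> blocks and trims both symbols before JSON-decoding the payload, so interleaved reasoning text no longer gets parsed as a tool call. A self-contained demonstration of the regex follows; the sample text is invented, and QWEN_TOOL_CALL_SYMBOLS is assumed to be ("<tool_call>", "</tool_call>") as the surrounding code implies:

```python
# Standalone demonstration of the block-splitting logic added above.
import json
import re

QWEN_TOOL_CALL_SYMBOLS = ("<tool_call>", "</tool_call>")  # assumed values


def split_into_blocks(text: str) -> list[str]:
    # Match complete <think>...</think> or <tool_call>...</tool_call> blocks.
    pattern = r"(<(think|tool_call)>.*?</\2>)"
    blocks = re.findall(pattern, text, re.DOTALL)
    return [match[0] for match in blocks]


sample = (
    "<think>The user wants the weather.</think>\n"
    '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
)

for block in split_into_blocks(sample):
    inner = block
    pos1 = inner.find(QWEN_TOOL_CALL_SYMBOLS[0])
    if pos1 != -1:
        inner = inner[pos1 + len(QWEN_TOOL_CALL_SYMBOLS[0]):]
    pos2 = inner.find(QWEN_TOOL_CALL_SYMBOLS[1])
    if pos2 != -1:
        inner = inner[:pos2]
    inner = inner.strip()
    try:
        print(json.loads(inner))           # the tool call parses to a dict
    except json.JSONDecodeError:
        print("not a tool call:", inner)   # e.g. the <think> block
```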
@@ -580,8 +588,12 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_qwen_chat_arguments(cls, c) -> List[Tuple]:
+    def _eval_qwen_chat_arguments(
+        cls, c, tool_call_text: Optional[str] = None
+    ) -> List[Tuple]:
         text = c["choices"][0]["text"]
+        if tool_call_text:
+            text = tool_call_text
         return cls._handle_qwen_tool_result(text)
 
     @classmethod
@@ -662,12 +674,14 @@ class ChatModelMixin:
         return results
 
     @classmethod
-    def _eval_tool_arguments(cls, model_family, c):
+    def _eval_tool_arguments(
+        cls, model_family, c, tool_call_text: Optional[str] = None
+    ):
         family = model_family.model_family or model_family.model_name
         if family in GLM4_TOOL_CALL_FAMILY:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
-            result = cls._eval_qwen_chat_arguments(c)
+            result = cls._eval_qwen_chat_arguments(c, tool_call_text)
         elif family in LLAMA3_TOOL_CALL_FAMILY:
             result = cls._eval_llama3_chat_arguments(c)
         elif family in DEEPSEEK_TOOL_CALL_FAMILY:
@@ -687,15 +701,17 @@ class ChatModelMixin:
         c,
         chunk_id=None,
         reasoning_parser: Optional[ReasoningParser] = None,
+        tool_call_text: Optional[str] = None,
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c)
+        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
         tool_calls = []
         failed_contents = []
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
+                        "index": 0,
                         "id": f"call_{_id}",
                         "type": "function",
                         "function": {
@@ -782,9 +798,12 @@ class ChatModelMixin:
                     }
                 )
             else:
-                failed_contents.append(content)
+                if content:
+                    failed_contents.append(content)
         finish_reason = "tool_calls" if tool_calls else "stop"
 
+        content = ". ".join(failed_contents) if failed_contents else None
+
         # fix: qwen tool_call content field return null
         family = model_family.model_family or model_family.model_name
         if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None: