xinference 1.9.1__py3-none-any.whl → 1.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +400 -3
- xinference/client/restful/async_restful_client.py +20 -3
- xinference/client/restful/restful_client.py +20 -3
- xinference/constants.py +2 -0
- xinference/core/supervisor.py +111 -49
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +26 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +58 -1
- xinference/model/embedding/sentence_transformers/core.py +4 -4
- xinference/model/embedding/vllm/core.py +7 -1
- xinference/model/image/model_spec.json +71 -3
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +4 -0
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +1 -0
- xinference/model/llm/llm_family.json +503 -21
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +32 -55
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +190 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/utils.py +138 -53
- xinference/model/llm/vllm/core.py +95 -78
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.d192c4f3.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/METADATA +24 -4
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/RECORD +302 -76
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/WHEEL +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.1.dist-info → xinference-1.10.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/qwen2_vl.py
CHANGED
@@ -27,11 +27,19 @@ logger = logging.getLogger(__name__)
 
 
 @register_batching_multimodal_models(
-    "qwen2-vl-instruct",
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 @register_transformer
 @register_non_default_model(
-    "qwen2-vl-instruct",
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 class Qwen2VLChatModel(PytorchMultiModalModel):
     def _sanitize_model_config(
@@ -47,7 +55,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb", "fp8"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -56,6 +64,8 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
+        if "qwen3-vl" in llm_family.lower():
+            return True
         return False
 
     def decide_device(self):
@@ -85,13 +95,19 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         except ImportError:
             Qwen2_5_VLForConditionalGeneration = None
 
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = None
+
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-
-            Qwen2_5_VLForConditionalGeneration
-
-
-
+        if "qwen2.5" in llm_family:
+            model_cls = Qwen2_5_VLForConditionalGeneration
+        elif "qwen3" in llm_family:
+            model_cls = AutoModelForImageTextToText
+        else:
+            model_cls = Qwen2VLForConditionalGeneration
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
@@ -118,6 +134,16 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
                 torch_dtype="float16",
                 **kwargs,
             ).eval()
+        elif device == "mps":
+            # MacOS special, see https://github.com/QwenLM/Qwen2.5-VL/issues/761
+            self._model = model_cls.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="eager",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).eval()
         else:
             self._model = model_cls.from_pretrained(
                 self.model_path,
xinference/model/llm/utils.py
CHANGED
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +71,10 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-Thinking",
     "Qwen3-Instruct",
     "Qwen3-Coder",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
+    "Qwen3-Next-Instruct",
+    "Qwen3-Next-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -95,6 +100,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
@@ -339,9 +351,7 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
-            and reasoning_parser
-            and reasoning_parser.check_content_parser()
+            if choices and choices[0]["finish_reason"] is not None or not choices
             else None
         )
         chat_chunk = {
@@ -590,16 +600,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-
-
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+
             except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results
 
     @classmethod
@@ -757,47 +792,64 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result
 
-    @classmethod
     def _post_process_completion_chunk(
-
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
+        if not c.get("choices"):
+            return c
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result =
+        tool_result = None
+        finish_reason = None
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+        if tool_result is None and not finish_reason:
+            return None
         tool_calls = []
         failed_contents = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)
 
-
+        finish_reason = "tool_calls" if tool_calls else finish_reason
 
-
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None
 
         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
 
@@ -805,11 +857,7 @@ class ChatModelMixin:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
         except Exception:
-            usage = {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            }
+            usage = None
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -826,29 +874,32 @@ class ChatModelMixin:
             "usage": usage,
         }
 
-    @classmethod
     def _post_process_completion(
-
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if
-
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content =
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content
 
-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
            if func:
                 tool_calls.append(
@@ -868,14 +919,9 @@ class ChatModelMixin:
 
         content = "".join(failed_contents) if failed_contents else None
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +989,45 @@ class ChatModelMixin:
 
         return transformed_messages
 
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                chat_chunk["choices"]
+                and "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+
 
 def get_model_version(
     model_name: str,
xinference/model/llm/vllm/core.py
CHANGED
@@ -264,6 +264,9 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.4"):
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.5"):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
 
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.0"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("Baichuan-M2")
+
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.1"):
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")
 
@@ -282,10 +285,15 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
-    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
 
-if VLLM_INSTALLED and VLLM_VERSION
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Thinking")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.2"):
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
 
 
 class VLLMModel(LLM):
@@ -393,6 +401,7 @@ class VLLMModel(LLM):
             self.prepare_parse_reasoning_content(
                 reasoning_content, enable_thinking=enable_thinking
             )
+            self.prepare_parse_tool_calls()
 
         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -773,7 +782,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()
 
         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None
 
@@ -784,8 +792,6 @@ class VLLMModel(LLM):
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"
 
         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -833,10 +839,6 @@ class VLLMModel(LLM):
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )
 
         return sanitized
 
@@ -940,9 +942,21 @@ class VLLMModel(LLM):
 
     async def _get_tokenizer(self, lora_request: Any) -> Any:
         try:
-
+            # vLLM 0.11.0+ get_tokenizer doesn't accept lora_request parameter
+            if (
+                VLLM_VERSION >= version.parse("0.11.0")
+                or VLLM_VERSION.base_version >= "0.11.0"
+            ):
+                return await self._engine.get_tokenizer()  # type: ignore
+            else:
+                return await self._engine.get_tokenizer(lora_request)  # type: ignore
         except AttributeError:
-
+            # Fallback to get_tokenizer_async for older versions
+            try:
+                return await self._engine.get_tokenizer_async(lora_request)  # type: ignore
+            except (AttributeError, TypeError):
+                # If all else fails, try without parameters
+                return await self._engine.get_tokenizer()  # type: ignore
 
     def _tokenize(self, tokenizer: Any, prompt: str, config: dict) -> List[int]:
         truncate_prompt_tokens = config.get("truncate_prompt_tokens")
@@ -1023,23 +1037,65 @@ class VLLMModel(LLM):
             # guided decoding only available for vllm >= 0.6.3
             from vllm.sampling_params import GuidedDecodingParams
 
-
-
-
-
-
-
-
-
-
-
+            # Extract guided decoding parameters
+            guided_params: dict[str, Any] = {}
+            guided_json = sanitized_generate_config.pop("guided_json", None)
+            if guided_json:
+                guided_params["json"] = guided_json
+
+            guided_regex = sanitized_generate_config.pop("guided_regex", None)
+            if guided_regex:
+                guided_params["regex"] = guided_regex
+
+            guided_choice = sanitized_generate_config.pop("guided_choice", None)
+            if guided_choice:
+                guided_params["choice"] = guided_choice
+
+            guided_grammar = sanitized_generate_config.pop("guided_grammar", None)
+            if guided_grammar:
+                guided_params["grammar"] = guided_grammar
+
+            guided_json_object = sanitized_generate_config.pop(
+                "guided_json_object", None
             )
+            if guided_json_object:
+                guided_params["json_object"] = guided_json_object
 
-
-
+            guided_backend = sanitized_generate_config.pop(
+                "guided_decoding_backend", None
             )
+            if guided_backend:
+                guided_params["_backend"] = guided_backend
+
+            guided_whitespace_pattern = sanitized_generate_config.pop(
+                "guided_whitespace_pattern", None
+            )
+            if guided_whitespace_pattern:
+                guided_params["whitespace_pattern"] = guided_whitespace_pattern
+
+            # Create GuidedDecodingParams if we have any guided parameters
+            guided_options = None
+            if guided_params:
+                try:
+                    guided_options = GuidedDecodingParams(**guided_params)
+                except Exception as e:
+                    logger.warning(f"Failed to create GuidedDecodingParams: {e}")
+                    guided_options = None
+
+            # Use structured_outputs for vLLM >= 0.11.0, guided_decoding for older versions
+            if (
+                VLLM_VERSION >= version.parse("0.11.0")
+                or VLLM_VERSION.base_version >= "0.11.0"
+            ):
+                sampling_params = SamplingParams(
+                    structured_outputs=guided_options, **sanitized_generate_config
+                )
+            else:
+                sampling_params = SamplingParams(
+                    guided_decoding=guided_options, **sanitized_generate_config
+                )
         else:
-            # ignore generate configs
+            # ignore generate configs for older versions
             sanitized_generate_config.pop("guided_json", None)
             sanitized_generate_config.pop("guided_regex", None)
             sanitized_generate_config.pop("guided_choice", None)
@@ -1242,6 +1298,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
+
         if "reasoning" in getattr(self.model_family, "model_ability", []):
             generate_config.pop("stop", None)
             generate_config.pop("stop_token_ids", None)
@@ -1255,6 +1312,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             generate_config["stop_token_ids"] = (
                 self.model_family.stop_token_ids.copy()
             )
+
+        # if response_format exists, generate guided_json
+        if "response_format" in generate_config:
+            resp_format = generate_config["response_format"]
+            if (
+                isinstance(resp_format, dict)
+                and resp_format.get("type") == "json_schema"
+                and "json_schema" in resp_format
+            ):
+                schema = resp_format["json_schema"].get("schema_")
+                if schema:
+                    generate_config["guided_json"] = schema
+
         return generate_config
 
     @staticmethod
@@ -1291,59 +1361,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         return processed_messages
 
-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-
     @vllm_check
     async def async_chat(
         self,
@@ -1408,7 +1425,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)