PyPI - xinference - Versions diffs - 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl - Mend

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (317) hide show

xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py CHANGED Viewed

@@ -23,12 +23,27 @@ class DeepseekR1ToolParser(ToolParser):
         Initialize the DeepSeek R1 tool parser.
         """
         super().__init__()
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
+        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"
         # Regex pattern to match DeepSeek R1 tool call format
         self.tool_calls_regex = (
             r"<\｜tool▁call▁begin｜>function<\｜tool▁sep｜>([^\n]+)\n"
             r"```json\n(.*?)\n```<\｜tool▁call▁end｜>"
         )
+        # Regex pattern to match the entire tool-calls wrapper block.
+        # We intentionally do NOT match <think> blocks here so that the
+        # "text before" chunk will include both the think block and any
+        # narrative text up to the tool calls wrapper, yielding exactly two
+        # blocks when there is a single tool calls section:
+        # [before_text_including_think, tool_calls_wrapper_block]
+        self.content_regex = r"(<\｜tool▁calls▁begin｜>.*?<\｜tool▁calls▁end｜>)"
     def extract_tool_calls(
         self, model_output: str
     ) -> List[Tuple[Optional[str], Optional[str], Optional[dict]]]:
@@ -56,49 +71,96 @@ class DeepseekR1ToolParser(ToolParser):
             >>> print(result)
             [(None, 'get_current_weather', {'location': 'Beijing'})]
         """
-        matches = re.findall(self.tool_calls_regex, model_output, re.DOTALL)
-        if not matches:
-            # No tool calls found, return the original output as content
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
             return [(model_output, None, None)]
+        # Get all content blocks (text, thinking blocks, tool calls)
+        function_calls = self._get_function_calls(model_output)
         # Use set for deduplication of identical tool calls
         tool_calls = set()
         results: List[Tuple[Optional[str], Optional[str], Optional[dict]]] = []
-        for func_name, raw_json in matches:
-            func_and_args = None
-            try:
-                # Parse JSON arguments
-                func_and_args = json.loads(raw_json)
-                # Create hashable representation for deduplication
-                arguments_hashable = frozenset(func_and_args.items())
-                tool_call_tuple = (
-                    None,  # No content error
-                    func_name,
-                    func_and_args,
+        for content_block in function_calls:
+            # Check if this block is a tool call
+            if (
+                self.tool_call_start_token in content_block
+                and self.tool_call_end_token in content_block
+            ):
+                # Extract function name and arguments from tool call block
+                matches = re.findall(self.tool_calls_regex, content_block, re.DOTALL)
+                if not matches:
+                    # Malformed tool call, treat as regular content
+                    results.append((content_block, None, None))
+                    continue
+                func_name, raw_json = matches[0]  # Take the first match
+                func_and_args = None
+                try:
+                    # Parse JSON arguments
+                    func_and_args = json.loads(raw_json)
+                    # Create hashable representation for deduplication
+                    arguments_hashable = frozenset(func_and_args.items())
+                    tool_call_tuple = (
+                        None,  # No content error
+                        func_name,
+                        func_and_args,
+                    )
+                except Exception as e:
+                    # JSON parsing failed, treat as raw content
+                    logger.warning(
+                        f"Failed to parse tool call JSON: {raw_json}, error: {e}"
+                    )
+                    tool_call_tuple = (raw_json, None, None)
+                    arguments_hashable = None
+                # Create deduplication key
+                dedup_key = (
+                    (func_name, arguments_hashable)
+                    if func_and_args is not None
+                    else raw_json
                 )
-            except Exception as e:
-                # JSON parsing failed, treat as raw content
-                logger.warning(
-                    f"Failed to parse tool call JSON: {raw_json}, error: {e}"
-                )
-                tool_call_tuple = (raw_json, None, None)
-                arguments_hashable = None
-            # Create deduplication key
-            dedup_key = (
-                (func_name, arguments_hashable)
-                if func_and_args is not None
-                else raw_json
-            )
-            # Add to results if not already seen
-            if dedup_key not in tool_calls:
-                tool_calls.add(dedup_key)
-                results.append(tool_call_tuple)
+                # Add to results if not already seen
+                if dedup_key not in tool_calls:
+                    tool_calls.add(dedup_key)
+                    results.append(tool_call_tuple)
+            else:
+                # This is regular content (text or thinking block), add as-is
+                if content_block.strip():  # Only add non-empty content
+                    results.append((content_block, None, None))
         return results
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+        Args:
+            model_output (str): The complete model output to parse.
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
     def extract_tool_calls_streaming(
         self, previous_text: List[str], current_text: str, delta_text: str
     ) -> Optional[Any]:

xinference/model/llm/transformers/multimodal/qwen2_vl.py CHANGED Viewed

@@ -27,11 +27,19 @@ logger = logging.getLogger(__name__)
 @register_batching_multimodal_models(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 @register_transformer
 @register_non_default_model(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 class Qwen2VLChatModel(PytorchMultiModalModel):
     def _sanitize_model_config(
@@ -47,7 +55,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb", "fp8"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -56,6 +64,8 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
+        if "qwen3-vl" in llm_family.lower():
+            return True
         return False
     def decide_device(self):
@@ -85,13 +95,19 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         except ImportError:
             Qwen2_5_VLForConditionalGeneration = None
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = None
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        model_cls = (
-            Qwen2_5_VLForConditionalGeneration
-            if "qwen2.5" in llm_family
-            else Qwen2VLForConditionalGeneration
-        )
+        if "qwen2.5" in llm_family:
+            model_cls = Qwen2_5_VLForConditionalGeneration
+        elif "qwen3" in llm_family:
+            model_cls = AutoModelForImageTextToText
+        else:
+            model_cls = Qwen2VLForConditionalGeneration
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
@@ -118,6 +134,16 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
                 torch_dtype="float16",
                 **kwargs,
             ).eval()
+        elif device == "mps":
+            # MacOS special, see https://github.com/QwenLM/Qwen2.5-VL/issues/761
+            self._model = model_cls.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="eager",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).eval()
         else:
             self._model = model_cls.from_pretrained(
                 self.model_path,

xinference/model/llm/utils.py CHANGED Viewed

@@ -71,6 +71,10 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-Thinking",
     "Qwen3-Instruct",
     "Qwen3-Coder",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
+    "Qwen3-Next-Instruct",
+    "Qwen3-Next-Thinking",
 ]
 GLM4_TOOL_CALL_FAMILY = [
@@ -347,9 +351,7 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
-            and reasoning_parser
-            and reasoning_parser.check_content_parser()
+            if choices and choices[0]["finish_reason"] is not None or not choices
             else None
         )
         chat_chunk = {
@@ -798,7 +800,11 @@ class ChatModelMixin:
         chunk_id=None,
         previous_texts: List[str] = [""],
     ):
+        if not c.get("choices"):
+            return c
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
+        tool_result = None
+        finish_reason = None
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls_streaming(
                 [],
@@ -851,11 +857,7 @@ class ChatModelMixin:
             usage = c.get("usage")
             assert "prompt_tokens" in usage
         except Exception:
-            usage = {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            }
+            usage = None
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -1009,7 +1011,8 @@ class ChatModelMixin:
                 completion_chunk, self.reasoning_parser, previous_texts
             )
             if (
-                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                chat_chunk["choices"]
+                and "reasoning_content" in chat_chunk["choices"][0]["delta"]
                 and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
             ):
                 yield chat_chunk

xinference/model/llm/vllm/core.py CHANGED Viewed

@@ -264,6 +264,9 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.4"):
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.5"):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.0"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("Baichuan-M2")
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.1"):
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")
@@ -282,10 +285,15 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
-    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
-if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.1.1"):
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Instruct")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Thinking")
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.2"):
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
 class VLLMModel(LLM):
@@ -934,9 +942,21 @@ class VLLMModel(LLM):
     async def _get_tokenizer(self, lora_request: Any) -> Any:
         try:
-            return await self._engine.get_tokenizer(lora_request)  # type: ignore
+            # vLLM 0.11.0+ get_tokenizer doesn't accept lora_request parameter
+            if (
+                VLLM_VERSION >= version.parse("0.11.0")
+                or VLLM_VERSION.base_version >= "0.11.0"
+            ):
+                return await self._engine.get_tokenizer()  # type: ignore
+            else:
+                return await self._engine.get_tokenizer(lora_request)  # type: ignore
         except AttributeError:
-            return await self._engine.get_tokenizer_async(lora_request)  # type: ignore
+            # Fallback to get_tokenizer_async for older versions
+            try:
+                return await self._engine.get_tokenizer_async(lora_request)  # type: ignore
+            except (AttributeError, TypeError):
+                # If all else fails, try without parameters
+                return await self._engine.get_tokenizer()  # type: ignore
     def _tokenize(self, tokenizer: Any, prompt: str, config: dict) -> List[int]:
         truncate_prompt_tokens = config.get("truncate_prompt_tokens")
@@ -1017,23 +1037,65 @@ class VLLMModel(LLM):
             # guided decoding only available for vllm >= 0.6.3
             from vllm.sampling_params import GuidedDecodingParams
-            guided_options = GuidedDecodingParams.from_optional(
-                json=sanitized_generate_config.pop("guided_json", None),
-                regex=sanitized_generate_config.pop("guided_regex", None),
-                choice=sanitized_generate_config.pop("guided_choice", None),
-                grammar=sanitized_generate_config.pop("guided_grammar", None),
-                json_object=sanitized_generate_config.pop("guided_json_object", None),
-                backend=sanitized_generate_config.pop("guided_decoding_backend", None),
-                whitespace_pattern=sanitized_generate_config.pop(
-                    "guided_whitespace_pattern", None
-                ),
+            # Extract guided decoding parameters
+            guided_params: dict[str, Any] = {}
+            guided_json = sanitized_generate_config.pop("guided_json", None)
+            if guided_json:
+                guided_params["json"] = guided_json
+            guided_regex = sanitized_generate_config.pop("guided_regex", None)
+            if guided_regex:
+                guided_params["regex"] = guided_regex
+            guided_choice = sanitized_generate_config.pop("guided_choice", None)
+            if guided_choice:
+                guided_params["choice"] = guided_choice
+            guided_grammar = sanitized_generate_config.pop("guided_grammar", None)
+            if guided_grammar:
+                guided_params["grammar"] = guided_grammar
+            guided_json_object = sanitized_generate_config.pop(
+                "guided_json_object", None
             )
+            if guided_json_object:
+                guided_params["json_object"] = guided_json_object
-            sampling_params = SamplingParams(
-                guided_decoding=guided_options, **sanitized_generate_config
+            guided_backend = sanitized_generate_config.pop(
+                "guided_decoding_backend", None
             )
+            if guided_backend:
+                guided_params["_backend"] = guided_backend
+            guided_whitespace_pattern = sanitized_generate_config.pop(
+                "guided_whitespace_pattern", None
+            )
+            if guided_whitespace_pattern:
+                guided_params["whitespace_pattern"] = guided_whitespace_pattern
+            # Create GuidedDecodingParams if we have any guided parameters
+            guided_options = None
+            if guided_params:
+                try:
+                    guided_options = GuidedDecodingParams(**guided_params)
+                except Exception as e:
+                    logger.warning(f"Failed to create GuidedDecodingParams: {e}")
+                    guided_options = None
+            # Use structured_outputs for vLLM >= 0.11.0, guided_decoding for older versions
+            if (
+                VLLM_VERSION >= version.parse("0.11.0")
+                or VLLM_VERSION.base_version >= "0.11.0"
+            ):
+                sampling_params = SamplingParams(
+                    structured_outputs=guided_options, **sanitized_generate_config
+                )
+            else:
+                sampling_params = SamplingParams(
+                    guided_decoding=guided_options, **sanitized_generate_config
+                )
         else:
-            # ignore generate configs
+            # ignore generate configs for older versions
             sanitized_generate_config.pop("guided_json", None)
             sanitized_generate_config.pop("guided_regex", None)
             sanitized_generate_config.pop("guided_choice", None)
@@ -1236,6 +1298,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
     ) -> Dict:
         if not generate_config:
             generate_config = {}
         if "reasoning" in getattr(self.model_family, "model_ability", []):
             generate_config.pop("stop", None)
             generate_config.pop("stop_token_ids", None)
@@ -1249,6 +1312,19 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 generate_config["stop_token_ids"] = (
                     self.model_family.stop_token_ids.copy()
                 )
+        # if response_format exists，generate guided_json
+        if "response_format" in generate_config:
+            resp_format = generate_config["response_format"]
+            if (
+                isinstance(resp_format, dict)
+                and resp_format.get("type") == "json_schema"
+                and "json_schema" in resp_format
+            ):
+                schema = resp_format["json_schema"].get("schema_")
+                if schema:
+                    generate_config["guided_json"] = schema
         return generate_config
     @staticmethod

xinference/thirdparty/audiotools/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+__version__ = "0.7.4"
+from .core import AudioSignal
+from .core import STFTParams
+from .core import Meter
+from .core import util
+from . import metrics
+from . import data
+from . import ml
+from .data import datasets
+from .data import transforms

xinference/thirdparty/audiotools/core/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from . import util
+from .audio_signal import AudioSignal
+from .audio_signal import STFTParams
+from .loudness import Meter

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

Potentially problematic release.

xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl