xinference 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference was flagged as possibly problematic by the registry's scanner.

Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/supervisor.py +29 -1
  5. xinference/model/audio/core.py +5 -0
  6. xinference/model/audio/kokoro.py +1 -1
  7. xinference/model/audio/kokoro_zh.py +124 -0
  8. xinference/model/audio/model_spec.json +20 -0
  9. xinference/model/embedding/sentence_transformers/core.py +4 -4
  10. xinference/model/embedding/vllm/core.py +7 -1
  11. xinference/model/image/model_spec.json +2 -3
  12. xinference/model/llm/core.py +10 -0
  13. xinference/model/llm/llama_cpp/core.py +1 -0
  14. xinference/model/llm/llm_family.json +40 -20
  15. xinference/model/llm/llm_family.py +1 -0
  16. xinference/model/llm/mlx/core.py +52 -33
  17. xinference/model/llm/sglang/core.py +2 -44
  18. xinference/model/llm/tool_parsers/__init__.py +58 -0
  19. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  20. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  21. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  22. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  23. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  24. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  25. xinference/model/llm/transformers/core.py +1 -1
  26. xinference/model/llm/utils.py +127 -45
  27. xinference/model/llm/vllm/core.py +2 -61
  28. xinference/types.py +105 -2
  29. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/METADATA +7 -3
  30. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/RECORD +34 -26
  31. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  32. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  33. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  34. {xinference-1.9.1.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from . import register_tool_parser
+from .abstract_tool_parser import ToolParser
+
+logger = logging.getLogger(__name__)
+
+
+@register_tool_parser("qwen")
+class QwenToolParser(ToolParser):
+    """
+    Tool parser implementation for Qwen model.
+
+    This parser handles the specific format used by Qwen for tool calls,
+    which uses XML-like tags for both thinking blocks and tool calls.
+
+    """
+
+    def __init__(self):
+        """
+        Initialize the Qwen tool parser.
+
+        Sets up the XML-like tokens and regex patterns used for parsing
+        Qwen model outputs containing thinking blocks and tool calls.
+        """
+        super().__init__()
+
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+
+        # Regex patterns for parsing different content types
+        self.think_regex = re.compile("<think>(.*?)</think>", re.DOTALL)
+        self.content_regex = r"(<(think|tool_call)>.*?</\2>)"
+        self.tool_call_complete_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>", re.DOTALL
+        )
+        self.tool_call_regex = re.compile(
+            r"<tool_call>.*?</tool_call>|<tool_call>.*?$", re.DOTALL
+        )
+
+    def _parse_json_function_call(
+        self,
+        function_call_str: str,
+    ) -> str:
+        """
+        Parse JSON function call from string.
+
+        Extracts the JSON content from tool_call XML tags.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            str: Extracted JSON string or original string if no match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return function_call_str
+        return function_calls[-1]
+
+    def _parse_json_function_call_stream(
+        self,
+        function_call_str: str,
+    ) -> Optional[str]:
+        """
+        Parse JSON function call from streaming string.
+
+        Extracts the JSON content from tool_call XML tags in streaming context.
+
+        Args:
+            function_call_str (str): The function call string to parse.
+
+        Returns:
+            Optional[str]: Extracted JSON string or None if no complete match found.
+        """
+        function_calls = self.tool_call_complete_regex.findall(function_call_str)
+        if len(function_calls) == 0:
+            return None
+        return function_calls[-1]
+
+    def is_contain_think_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the think end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if think end token is present.
+        """
+        return self.think_end_token in model_output
+
+    def is_contain_think(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete thinking blocks.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete thinking blocks are present.
+        """
+        return self.think_regex.search(model_output) is not None
+
+    def is_contain_tool_call(self, model_output: str) -> bool:
+        """
+        Check if the model output contains complete tool calls.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if complete tool calls are present.
+        """
+        return self.tool_call_complete_regex.search(model_output) is not None
+
+    def is_contain_tool_call_start_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call start token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call start token is present.
+        """
+        return self.tool_call_start_token in model_output
+
+    def is_contain_tool_call_end_token(self, model_output: str) -> bool:
+        """
+        Check if the model output contains the tool call end token.
+
+        Args:
+            model_output (str): The model output to check.
+
+        Returns:
+            bool: True if tool call end token is present.
+        """
+        return self.tool_call_end_token in model_output
+
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+
+        Args:
+            model_output (str): The complete model output to parse.
+
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
+
+    def _get_function_calls_streaming(self, model_output: str) -> List[str]:
+        """
+        Extract function calls from streaming model output.
+
+        Finds both complete and incomplete tool calls in streaming context.
+
+        Args:
+            model_output (str): The streaming model output to parse.
+
+        Returns:
+            List[str]: List of tool call blocks (complete or incomplete).
+        """
+        matched_ranges = self.tool_call_regex.findall(model_output)
+        return matched_ranges
+
+    def extract_tool_calls(
+        self, model_output: str
+    ) -> List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from complete model output.
+
+        Parses the model output to find tool calls and thinking blocks,
+        extracting function names and arguments from JSON content within
+        tool_call XML tags.
+
+        Args:
+            model_output (str): The complete output string from the model.
+
+        Returns:
+            List[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A list of tuples where each tuple contains:
+                - content (str or None): Raw content if parsing failed, None if successful
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+
+        Example:
+            >>> parser = QwenToolParser()
+            >>> output = '<tool_call>\n{"name": "get_weather", "arguments": {"location": "Beijing"}}\n</tool_call>'
+            >>> result = parser.extract_tool_calls(output)
+            >>> print(result)
+            [(None, 'get_weather', {'location': 'Beijing'})]
+        """
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
+            return [(model_output, None, None)]
+
+        try:
+            function_calls = self._get_function_calls(model_output)
+            if len(function_calls) == 0:
+                return [(model_output, None, None)]
+
+            results: List[
+                Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]
+            ] = []
+            for function_call in function_calls:
+                try:
+                    parsed_json = self._parse_json_function_call(function_call)
+                    res = json.loads(parsed_json, strict=False)
+                    results.append((None, res["name"], res["arguments"]))
+                except Exception as e:
+                    logger.error(
+                        "Can't parse single qwen tool call output: %s. Error: %s",
+                        function_call,
+                        e,
+                    )
+                    results.append((function_call, None, None))
+            return results
+
+        except Exception as e:
+            logger.error(
+                "Can't parse qwen tool call output: %s. Error: %s",
+                model_output,
+                e,
+            )
+            return [(model_output, None, None)]
+
+    def _has_unclosed_tool_call(self, text: str) -> bool:
+        """
+        Check if the text has unclosed tool_call tags.
+
+        Counts the number of opening and closing tool_call tags to determine
+        if there are any unclosed tool calls in the text.
+
+        Args:
+            text (str): The text to check for unclosed tags.
+
+        Returns:
+            bool: True if there are unclosed tool_call tags.
+        """
+        if not text:
+            return True
+        start_count = text.count(self.tool_call_start_token)
+        end_count = text.count(self.tool_call_end_token)
+        return start_count > end_count
+
+    def extract_tool_calls_streaming(
+        self, previous_text: List[str], current_text: str, delta_text: str
+    ) -> Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+        """
+        Extract tool calls from streaming output.
+
+        Processes streaming model output to detect and extract tool calls
+        as they are being generated. Handles incomplete tool calls and
+        determines when a complete tool call is available.
+
+        Args:
+            previous_text (List[str]): Previous text chunks from the stream.
+            current_text (str): Current accumulated text.
+            delta_text (str): New text delta in this chunk.
+
+        Returns:
+            Optional[Tuple[Optional[str], Optional[str], Optional[Dict[str, Any]]]]:
+                A tuple containing:
+                - content (str or None): Text content or None for tool calls
+                - function_name (str or None): Name of the function to call
+                - arguments (dict or None): Function arguments
+                Returns None if no complete tool call is ready.
+
+        Note:
+            This method is designed to work with Qwen's streaming output format
+            and handles partial tool calls during generation.
+        """
+        try:
+            # Check if current output contains tool_call start token
+            if self.is_contain_tool_call_start_token(current_text):
+                function_calls = self._get_function_calls_streaming(current_text)
+                # If the last function call contains thinking, it's not a tool call
+                if self.is_contain_think(function_calls[-1]):
+                    return None
+                # If the previous round's tool_call tags are closed, this is a new tool call
+                if not self._has_unclosed_tool_call(previous_text[-1]):
+                    return None
+                # Parse and return
+                function_call = self._parse_json_function_call_stream(
+                    function_calls[-1]
+                )
+                if function_call is None:
+                    return None
+                res = json.loads(function_call, strict=False)
+                return None, res["name"], res["arguments"]
+            else:
+                # Return delta text as regular content
+                return (delta_text, None, None)
+
+        except Exception as e:
+            logger.error("Error in Qwen streaming tool call extraction: %s", e)
+            raise
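
The `Example` docstring above shows the intended call pattern. Below is a slightly fuller, self-contained sketch of the non-streaming API on a mixed Qwen output; it assumes the import path matches the file added in this release (`xinference/model/llm/tool_parsers/qwen_tool_parser.py`) and relies only on the `extract_tool_calls` behavior visible in this hunk:

```python
from xinference.model.llm.tool_parsers.qwen_tool_parser import QwenToolParser

parser = QwenToolParser()

# Mixed output: free text, a <think> block, and one <tool_call> block.
output = (
    "Let me check that for you."
    "<think>The user wants the weather in Beijing.</think>"
    '<tool_call>\n{"name": "get_weather", "arguments": {"location": "Beijing"}}\n</tool_call>'
)

for content, func, args in parser.extract_tool_calls(output):
    if func:
        print("tool call:", func, args)  # -> tool call: get_weather {'location': 'Beijing'}
    else:
        # Non-JSON segments (plain text and <think> blocks) come back as
        # content tuples; the parser also logs a parse error for each one.
        print("content:", content)
```

Note that `extract_tool_calls` returns `[(model_output, None, None)]` unchanged when no `<tool_call>` token is present, so callers can handle tool and non-tool outputs uniformly.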
@@ -332,6 +332,7 @@ class PytorchModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         logger.debug("Loading Transformers model with kwargs: %s", kwargs)

@@ -983,7 +984,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
                 self.model_family,
                 self.model_uid,
                 req.completion[0],
-                self.reasoning_parser,
             )
         else:
             req.completion[0] = self._to_chat_completion(
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser

 logger = logging.getLogger(__name__)

@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]


 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
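
The `@register_tool_parser("qwen")` decorator used in the new parser file comes from the new `tool_parsers/__init__.py` (item 18 in the file list), which is not shown in this view. A minimal registry of that decorator-plus-lookup shape might look like the following sketch; the `_TOOL_PARSERS` dict and `get_tool_parser` helper are hypothetical names, not the package's confirmed API:

```python
from typing import Callable, Dict, Type

# Hypothetical module-level registry: family name -> parser class.
_TOOL_PARSERS: Dict[str, Type] = {}

def register_tool_parser(name: str) -> Callable[[type], type]:
    """Class decorator that records a parser class under a model-family name."""
    def wrapper(cls: type) -> type:
        _TOOL_PARSERS[name] = cls
        return cls
    return wrapper

def get_tool_parser(name: str):
    """Hypothetical helper: instantiate the parser registered for `name`."""
    return _TOOL_PARSERS[name]()
```

A registry like this would let the `prepare_parse_tool_calls()` hooks wired up in the surrounding hunks resolve a parser by model family at load time, leaving `self.tool_parser = None` until then.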
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-                results.append((None, res["name"], res["arguments"]))
-            except Exception as e:
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+            except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results

     @classmethod
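
The rewritten block above splits what used to be a single `except Exception` into three explicit outcomes: a well-formed dict becomes a tool call, JSON of the wrong shape is kept as plain content with a warning, and invalid JSON is kept as content with an error. A standalone illustration of which branch each input takes (inputs invented for the example):

```python
import json

candidates = [
    '{"name": "get_weather", "arguments": {"location": "Beijing"}}',  # -> tool call
    '[1, 2, 3]',                # parses, but not a dict       -> kept as content
    '{"name": "get_weather"}',  # dict missing "arguments"     -> kept as content
    '{broken json',             # invalid JSON                 -> kept as content
]

for content in candidates:
    try:
        res = json.loads(content, strict=False)
        if isinstance(res, dict) and "name" in res and "arguments" in res:
            print("tool call:", res["name"], res["arguments"])
        else:
            print("wrong shape, kept as content:", content)
    except json.JSONDecodeError:
        print("invalid JSON, kept as content:", content)
```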
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result

-    @classmethod
     def _post_process_completion_chunk(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-        reasoning_parser: Optional[ReasoningParser] = None,
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+            if tool_result is None and not finish_reason:
+                return None
         tool_calls = []
         failed_contents = []
-        for content, func, args in tool_result:
-            if func:
-                tool_calls.append(
-                    {
-                        "index": 0,
-                        "id": f"call_{_id}",
-                        "type": "function",
-                        "function": {
-                            "name": func,
-                            "arguments": json.dumps(args, ensure_ascii=False),
-                        },
-                    }
-                )
-            else:
-                failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)

-        content = "".join(failed_contents) if failed_contents else None
+        finish_reason = "tool_calls" if tool_calls else finish_reason

-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None

         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }

-    @classmethod
     def _post_process_completion(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if reasoning_parser:
-            c = reasoning_parser.prepare_reasoning_content(c)
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content

-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@

         content = "".join(failed_contents) if failed_contents else None

-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +987,44 @@

         return transformed_messages

+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+

 def get_model_version(
     model_name: str,
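
This generator replaces the vLLM-specific copy removed further down: instead of buffering between explicit start/end sentinel chunks, it hands every delta to the tool parser together with a mutable `previous_texts` accumulator and lets the parser decide when a call is complete. A minimal sketch of that accumulation protocol using the Qwen parser (synthetic deltas; import path assumed from the file list):

```python
from xinference.model.llm.tool_parsers.qwen_tool_parser import QwenToolParser

parser = QwenToolParser()
previous_texts = [""]  # mutable accumulator, as threaded through _post_process_completion_chunk

deltas = [
    '<tool_call>\n{"name": "get_weather", ',
    '"arguments": {"location": "Beijing"}}',
    "\n</tool_call>",
]
for delta in deltas:
    current = previous_texts[-1] + delta
    result = parser.extract_tool_calls_streaming(previous_texts, current, delta)
    previous_texts[-1] = current  # update the accumulator after each parse attempt
    if result and result[1]:
        # Only the chunk that closes </tool_call> yields the parsed call.
        print("complete tool call:", result[1], result[2])  # get_weather {'location': 'Beijing'}
```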
@@ -393,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()

         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -773,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()

         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None

@@ -784,8 +784,6 @@
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"

         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -833,10 +831,6 @@
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )

         return sanitized

@@ -1291,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):

         return processed_messages

-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-

     @vllm_check
     async def async_chat(
         self,
@@ -1408,7 +1349,7 @@
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)