xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py CHANGED
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser
 
 logger = logging.getLogger(__name__)
 
@@ -82,7 +83,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528", "Deepseek-V3.1"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
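Note: the mixin now stores a per-model tool_parser (wired up by the new xinference/model/llm/tool_parsers package listed above) and delegates extraction to it. A minimal sketch of the interface implied by the calls in this diff; the class name, type alias, and docstrings here are assumptions, not the actual contents of abstract_tool_parser.py:

from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

# Hypothetical contract inferred from the extract_tool_calls / extract_tool_calls_streaming
# calls visible in the hunks below; the real abstract_tool_parser.py may differ.
ToolCallResult = Tuple[Optional[str], Optional[str], Optional[dict]]  # (raw content, name, arguments)

class ToolParser(ABC):
    @abstractmethod
    def extract_tool_calls(self, model_output: str) -> List[ToolCallResult]:
        """Parse the full generated text once generation has finished."""

    @abstractmethod
    def extract_tool_calls_streaming(
        self, previous_texts: List[str], current_text: str, delta_text: str
    ) -> Optional[ToolCallResult]:
        """Parse incrementally; return None while no complete call is available."""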
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-                results.append((None, res["name"], res["arguments"]))
-            except Exception as e:
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+            except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results
 
     @classmethod
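The stricter parsing above only treats a JSON object carrying both "name" and "arguments" as a tool call; anything else is logged and kept as plain content. A standalone illustration of the two cases:

import json

# Accepted: a dict with both required fields.
ok = '{"name": "get_weather", "arguments": {"city": "Paris"}}'
res = json.loads(ok, strict=False)
assert isinstance(res, dict) and "name" in res and "arguments" in res

# Kept as plain content: valid JSON, but not a dict with both fields.
bad = '["get_weather", {"city": "Paris"}]'
res = json.loads(bad, strict=False)
assert not isinstance(res, dict)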
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result
 
-    @classmethod
     def _post_process_completion_chunk(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-        reasoning_parser: Optional[ReasoningParser] = None,
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+            if tool_result is None and not finish_reason:
+                return None
         tool_calls = []
         failed_contents = []
-        for content, func, args in tool_result:
-            if func:
-                tool_calls.append(
-                    {
-                        "index": 0,
-                        "id": f"call_{_id}",
-                        "type": "function",
-                        "function": {
-                            "name": func,
-                            "arguments": json.dumps(args, ensure_ascii=False),
-                        },
-                    }
-                )
-            else:
-                failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)
 
-        content = "".join(failed_contents) if failed_contents else None
+        finish_reason = "tool_calls" if tool_calls else finish_reason
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None
 
         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
 
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }
 
-    @classmethod
     def _post_process_completion(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if reasoning_parser:
-            c = reasoning_parser.prepare_reasoning_content(c)
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content
 
-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@ class ChatModelMixin:
 
         content = "".join(failed_contents) if failed_contents else None
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +987,44 @@ class ChatModelMixin:
 
         return transformed_messages
 
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+
 
 def get_model_version(
     model_name: str,
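For reference, the previous_texts bookkeeping that _async_to_tool_completion_chunks shares with _post_process_completion_chunk accumulates deltas like this (standalone illustration; the delta strings are made up):

previous_texts = [""]
for delta_text in ["<tool_call>", '{"name": "f", ', '"arguments": {}}', "</tool_call>"]:
    current_text = previous_texts[-1] + delta_text if previous_texts else delta_text
    # the streaming tool parser sees (previous_texts, current_text, delta_text) here
    previous_texts[-1] = current_text
print(previous_texts[-1])
# <tool_call>{"name": "f", "arguments": {}}</tool_call>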
xinference/model/llm/vllm/core.py CHANGED
@@ -273,13 +273,19 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Deepseek-V3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+    VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.1.1"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
 
 
 class VLLMModel(LLM):
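These gates rely on packaging's version ordering, under which the four-component 0.10.1.1 sorts above 0.10.1 and 0.10.0:

from packaging import version

assert version.parse("0.10.1.1") > version.parse("0.10.1") > version.parse("0.10.0")
assert version.parse("0.9.2") < version.parse("0.10.0")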
@@ -387,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -767,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()
 
         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None
 
@@ -778,8 +784,6 @@ class VLLMModel(LLM):
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"
 
         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -827,10 +831,6 @@ class VLLMModel(LLM):
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )
 
         return sanitized
 
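With the explicit guided_decoding_backend plumbing removed, the sanitizer only lifts the schema out of an OpenAI-style response_format and leaves backend selection to vLLM. A standalone sketch of the extraction kept by these hunks; the type check and the sample schema are illustrative, not copied from the source:

response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "weather",
        "json_schema": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}

guided_json = None
if response_format and response_format.get("type") == "json_schema":
    json_schema = response_format.get("json_schema")
    assert json_schema is not None
    guided_json = json_schema.get("json_schema")  # inner schema handed to guided decoding

print(guided_json)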
@@ -1285,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         return processed_messages
 
-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-
     @vllm_check
     async def async_chat(
         self,
@@ -1402,7 +1349,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)
 
xinference/model/rerank/core.py CHANGED
@@ -97,6 +97,8 @@ class RerankModel:
         model_uid: str,
         model_path: str,
         model_family: RerankModelFamilyV2,
+        quantization: Optional[str],
+        *,
         device: Optional[str] = None,
         use_fp16: bool = False,
         **kwargs,
@@ -105,6 +107,7 @@
         self._model_spec = model_family.model_specs[0]
         self._model_uid = model_uid
         self._model_path = model_path
+        self._quantization = quantization
         self._device = device
         self._use_fp16 = use_fp16
         self._model = None
xinference/model/rerank/sentence_transformers/core.py CHANGED
@@ -72,7 +72,7 @@ class SentenceTransformerRerankModel(RerankModel):
         enable_flash_attn = self._kwargs.pop(
             "enable_flash_attn", is_flash_attn_available()
         )
-        if self._auto_detect_type(self._model_path) != "normal" and enable_flash_attn:
+        if enable_flash_attn:
             logger.warning(
                 "flash_attn can only support fp16 and bf16, will force set `use_fp16` to True"
             )
xinference/model/rerank/vllm/core.py CHANGED
@@ -3,6 +3,7 @@ import uuid
 from typing import List, Optional
 
 from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
+from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
 
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
@@ -22,9 +23,27 @@ class VLLMRerankModel(RerankModel):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            if "hf_overrides" not in self._kwargs:
+                self._kwargs["hf_overrides"] = {
+                    "architectures": ["Qwen3ForSequenceClassification"],
+                    "classifier_from_token": ["no", "yes"],
+                    "is_original_qwen3_reranker": True,
+                }
+            elif isinstance(self._kwargs["hf_overrides"], dict):
+                self._kwargs["hf_overrides"].update(
+                    architectures=["Qwen3ForSequenceClassification"],
+                    classifier_from_token=["no", "yes"],
+                    is_original_qwen3_reranker=True,
+                )
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
 
+    @cache_clean
     def rerank(
         self,
         documents: List[str],
@@ -51,14 +70,45 @@ class VLLMRerankModel(RerankModel):
         """
         if kwargs:
             raise RuntimeError("Unexpected keyword arguments: {}".format(kwargs))
+        assert self._model is not None
         documents_size = len(documents)
         query_list = [query] * documents_size
-        assert self._model is not None
-        outputs = self._model.score(
-            documents,
-            query_list,
-            use_tqdm=False,
-        )
+
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+            prefix = (
+                "<|im_start|>system\nJudge whether the Document meets the requirements based on"
+                " the Query and the Instruct provided. "
+                'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+            )
+            suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+            document_template = "<Document>: {doc}{suffix}"
+            processed_queries = [
+                query_template.format(
+                    prefix=prefix, instruction=instruction, query=query
+                )
+                for query in query_list
+            ]
+            processed_documents = [
+                document_template.format(doc=doc, suffix=suffix) for doc in documents
+            ]
+            outputs = self._model.score(
+                processed_documents,
+                processed_queries,
+                use_tqdm=False,
+            )
+
+        else:
+            outputs = self._model.score(
+                documents,
+                query_list,
+                use_tqdm=False,
+            )
         scores = map(lambda scoreoutput: scoreoutput.outputs.score, outputs)
         documents = list(map(lambda doc: Document(text=doc), documents))
         document_parts = list(zip(range(documents_size), scores, documents))
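To make the template concrete, the same formatting applied to a single query/document pair looks like this (the prefix, suffix, and templates are copied from the hunk above; the query and document values are illustrative):

instruction = "Given a web search query, retrieve relevant passages that answer the query"
prefix = (
    "<|im_start|>system\nJudge whether the Document meets the requirements based on"
    " the Query and the Instruct provided. "
    'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
)
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"

query = "what is xinference?"
doc = "Xorbits Inference serves LLM, embedding and rerank models."
print(query_template.format(prefix=prefix, instruction=instruction, query=query))
print(document_template.format(doc=doc, suffix=suffix))
# LLM.score() then receives the processed documents and queries in that order.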
xinference/model/utils.py CHANGED
@@ -619,8 +619,7 @@ def is_flash_attn_available() -> bool:
             f"GPU compute capability {compute_capability} < 8.0, "
             "flash_attn may not work optimally"
         )
-        # Note: Some older GPUs may also support flash_attn, so this is just a warning
-        # This threshold can be adjusted based on actual requirements
+        return False
 
     # Try to import flash_attn core module to verify correct installation
     try:
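is_flash_attn_available now returns False outright below compute capability 8.0 instead of only warning. The capability probe it relies on can be reproduced like this (a sketch only; the real helper additionally verifies that flash_attn imports cleanly):

import torch

def sm80_or_newer() -> bool:
    # Illustrative check: flash-attn generally targets SM 8.0+ (Ampere or newer).
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (8, 0)

print(sm80_or_newer())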
xinference/model/video/model_spec.json CHANGED
@@ -224,7 +224,7 @@
     },
     "virtualenv": {
       "packages": [
-        "git+https://github.com/huggingface/diffusers",
+        "diffusers==0.35.1",
         "ftfy",
         "imageio-ffmpeg",
         "imageio",
@@ -241,5 +241,99 @@
         "model_revision": "master"
       }
     }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-A14B",
+    "model_family": "Wan",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+        "model_revision": "5be7df9619b54f4e2667b2755bc6a756675b5cd7"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+        "model_revision": "master"
+      }
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-i2v-A14B",
+    "model_family": "Wan",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+        "model_revision": "596658fd9ca6b7b71d5057529bbf319ecbc61d74"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+        "model_revision": "master"
+      }
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-ti2v-5B",
+    "model_family": "Wan",
+    "model_ability": [
+      "text2video",
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+        "model_revision": "b8fff7315c768468a5333511427288870b2e9635"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+        "model_revision": "master"
+      }
+    }
   }
 ]
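The new Wan2.2 entries register as built-in video models, so they should be launchable like the existing Wan entries. A hedged sketch with the Python client: the endpoint, chosen model, and prompt are illustrative, and the text_to_video call is assumed from the declared "text2video" ability rather than taken from this diff:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# model_name must match one of the spec entries above.
model_uid = client.launch_model(model_name="Wan2.2-ti2v-5B", model_type="video")
model = client.get_model(model_uid)
video = model.text_to_video(prompt="a red panda drinking tea, cinematic")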
xinference/thirdparty/cosyvoice/bin/export_jit.py CHANGED
@@ -61,8 +61,7 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            # NOTE set use_flow_cache=True when export jit for cache inference
-            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+            model = CosyVoice2(args.model_dir)
         except Exception:
             raise TypeError('no valid model_type!')
 
@@ -93,9 +92,9 @@ def main():
     else:
         # 3. export flow encoder
         flow_encoder = model.model.flow.encoder
-        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script = get_optimized_script(flow_encoder)
         script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script = get_optimized_script(flow_encoder.half())
         script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
         logging.info('successfully export flow_encoder')