xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +2 -1
- xinference/core/model.py +8 -4
- xinference/core/supervisor.py +2 -3
- xinference/core/worker.py +7 -5
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/local.py +5 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/deploy/worker.py +6 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/model_spec.json +44 -20
- xinference/model/core.py +3 -0
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +18 -4
- xinference/model/embedding/vllm/core.py +36 -9
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +178 -1
- xinference/model/image/stable_diffusion/core.py +155 -23
- xinference/model/llm/cache_manager.py +17 -3
- xinference/model/llm/harmony.py +245 -0
- xinference/model/llm/llama_cpp/core.py +41 -40
- xinference/model/llm/llm_family.json +688 -11
- xinference/model/llm/llm_family.py +1 -1
- xinference/model/llm/sglang/core.py +108 -5
- xinference/model/llm/transformers/core.py +20 -18
- xinference/model/llm/transformers/gemma3.py +1 -1
- xinference/model/llm/transformers/gpt_oss.py +91 -0
- xinference/model/llm/transformers/multimodal/core.py +1 -1
- xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
- xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
- xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
- xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
- xinference/model/llm/transformers/utils.py +1 -33
- xinference/model/llm/utils.py +61 -7
- xinference/model/llm/vllm/core.py +44 -8
- xinference/model/rerank/__init__.py +66 -23
- xinference/model/rerank/cache_manager.py +35 -0
- xinference/model/rerank/core.py +87 -339
- xinference/model/rerank/custom.py +33 -8
- xinference/model/rerank/model_spec.json +251 -212
- xinference/model/rerank/rerank_family.py +137 -0
- xinference/model/rerank/sentence_transformers/__init__.py +13 -0
- xinference/model/rerank/sentence_transformers/core.py +337 -0
- xinference/model/rerank/vllm/__init__.py +13 -0
- xinference/model/rerank/vllm/core.py +156 -0
- xinference/model/utils.py +108 -0
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +2 -0
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
- xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py
CHANGED
@@ -67,6 +67,9 @@ QWEN_TOOL_CALL_FAMILY = [
    "qwen3",
    "HuatuoGPT-o1-Qwen2.5",
    "DianJin-R1",
+    "Qwen3-Thinking",
+    "Qwen3-Instruct",
+    "Qwen3-Coder",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -79,9 +82,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
    "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = [
-    "deepseek-v3",
-]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528", "Deepseek-V3.1"]
 
 TOOL_CALL_FAMILY = (
    QWEN_TOOL_CALL_FAMILY
@@ -167,8 +168,7 @@ class ChatModelMixin:
                return json.loads(kwargs)
            except json.JSONDecodeError:
                raise TypeError(
-                    f"`chat_template_kwargs` should be json parsable, "
-                    f"got: {kwargs}"
+                    f"`chat_template_kwargs` should be json parsable, got: {kwargs}"
                )
        elif isinstance(kwargs, dict):
            return kwargs
@@ -254,7 +254,7 @@ class ChatModelMixin:
                ret += role + "\n" + text + intra_message_sep + "\n"
            else:
                placeholders = "\n".join(
-                    f"Image-{i+1}: <image>\n"
+                    f"Image-{i + 1}: <image>\n"
                    for i in range(
                        len(images) - len(image_futures), len(images)
                    )
@@ -463,6 +463,7 @@ class ChatModelMixin:
            chat_context_var.set(ctx)
 
        previous_texts = [""]
+        full_text = ""
        # Process chunks
        if reasoning_parser:
            set_context()
@@ -474,10 +475,14 @@ class ChatModelMixin:
                # usage
                chat_chunk = cls._get_final_chat_completion_chunk(chunk)
            else:
+                if choices[0].get("text"):
+                    full_text += choices[0]["text"]  # type: ignore
+
                chat_chunk = cls._to_chat_completion_chunk(
                    chunk, reasoning_parser, previous_texts
                )
            yield chat_chunk
+        logger.debug("Chat finished, output: %s", full_text)
 
    @staticmethod
    def _to_chat_completion(
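
The two hunks above thread a full_text accumulator through the streaming path so the complete output can be logged once when the stream ends. A minimal, self-contained sketch of the same idea over a hypothetical chunk stream (the chunk shape mirrors the mixin's; everything else is invented):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def stream_chunks():
    # hypothetical chunks shaped like the ones the mixin processes
    yield {"choices": [{"text": "Hello"}]}
    yield {"choices": [{"text": ", world"}]}

full_text = ""
for chunk in stream_chunks():
    choices = chunk["choices"]
    if choices and choices[0].get("text"):
        full_text += choices[0]["text"]
logger.debug("Chat finished, output: %s", full_text)  # Hello, world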
@@ -683,6 +688,52 @@ class ChatModelMixin:
 
        return results
 
+    @classmethod
+    def _eval_deepseek_r1_arguments(cls, c) -> List[Tuple]:
+        """
+        Parses tool calls from deepseek-r1 (0528) chat template format.
+        Returns:
+            List of (None, function_name, arguments_dict)
+            or (raw_content, None, None) if parsing fails.
+        """
+        text = c["choices"][0]["text"]
+        pattern = (
+            r"<\|tool▁call▁begin\|>function<\|tool▁sep\|>([^\n]+)\n"
+            r"```json\n(.*?)\n```<\|tool▁call▁end\|>"
+        )
+
+        matches = re.findall(pattern, text, re.DOTALL)
+        if not matches:
+            return [(text, None, None)]
+
+        tool_calls = set()
+        results = []
+
+        for func_name, raw_json in matches:
+            func_and_args = None
+            try:
+                func_and_args = json.loads(raw_json)
+                arguments_hashable = frozenset(func_and_args.items())
+                tool_call_tuple = (
+                    None,
+                    func_name,
+                    func_and_args,
+                )
+            except Exception:
+                tool_call_tuple = (raw_json, None, None)
+                arguments_hashable = None
+
+            dedup_key = (
+                (func_name, arguments_hashable)
+                if func_and_args is not None
+                else raw_json
+            )
+            if dedup_key not in tool_calls:
+                tool_calls.add(dedup_key)
+                results.append(tool_call_tuple)
+
+        return results
+
    @classmethod
    def _eval_tool_arguments(
        cls, model_family, c, tool_call_text: Optional[str] = None
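
To make the new deepseek-r1 parser concrete, here is a minimal, runnable sketch of the same regex applied to a hypothetical completion; the sample text and names are illustrative, not taken from the diff:

import json
import re

# Same pattern shape as _eval_deepseek_r1_arguments above.
PATTERN = (
    r"<\|tool▁call▁begin\|>function<\|tool▁sep\|>([^\n]+)\n"
    r"```json\n(.*?)\n```<\|tool▁call▁end\|>"
)

sample = (
    "<|tool▁call▁begin|>function<|tool▁sep|>get_weather\n"
    '```json\n{"city": "Beijing"}\n```<|tool▁call▁end|>'
)

for func_name, raw_json in re.findall(PATTERN, sample, re.DOTALL):
    print(func_name, json.loads(raw_json))  # get_weather {'city': 'Beijing'}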
@@ -695,7 +746,10 @@ class ChatModelMixin:
        elif family in LLAMA3_TOOL_CALL_FAMILY:
            result = cls._eval_llama3_chat_arguments(c)
        elif family in DEEPSEEK_TOOL_CALL_FAMILY:
-            result = cls._eval_deepseek_chat_arguments(c)
+            if family == "deepseek-r1-0528":
+                result = cls._eval_deepseek_r1_arguments(c)
+            else:
+                result = cls._eval_deepseek_chat_arguments(c)
        else:
            raise Exception(
                f"Model {model_family.model_name} is not support tool calls."

xinference/model/llm/vllm/core.py
CHANGED

@@ -89,6 +89,7 @@ class VLLMModelConfig(TypedDict, total=False):
    mm_processor_kwargs: NotRequired[dict[str, Any]]
    min_pixels: NotRequired[int]
    max_pixels: NotRequired[int]
+    enable_expert_parallel: bool
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
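
enable_expert_parallel is a new field surfaced through VLLMModelConfig. Launch-time kwargs are forwarded into the engine's model config, so the flag should be reachable from the client; a hedged sketch (the endpoint, model name, and the forwarding behavior are assumptions, not confirmed by this diff):

from xinference.client import Client

client = Client("http://localhost:9997")
# Extra kwargs flow into the vLLM model config, where the new
# enable_expert_parallel field is declared.
uid = client.launch_model(
    model_name="qwen3",  # placeholder model
    model_engine="vllm",
    enable_expert_parallel=True,
)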
@@ -272,9 +273,19 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
|
|
|
272
273
|
VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
|
|
273
274
|
VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
|
|
274
275
|
VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
|
|
276
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("Deepseek-V3.1")
|
|
275
277
|
|
|
276
|
-
if VLLM_INSTALLED and VLLM_VERSION
|
|
278
|
+
if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
|
|
277
279
|
VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
|
|
280
|
+
VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
|
|
281
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
|
|
282
|
+
|
|
283
|
+
if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
|
|
284
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
|
|
285
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
|
|
286
|
+
|
|
287
|
+
if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.1.1"):
|
|
288
|
+
VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
|
|
278
289
|
|
|
279
290
|
|
|
280
291
|
class VLLMModel(LLM):
|
|
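
The registration above is gated on the installed vLLM version. A minimal sketch of the same gating logic, with a stand-in version string and list replacing xinference's globals:

from packaging import version

INSTALLED = "0.10.1"  # stand-in for vllm.__version__
SUPPORTED = []

if version.parse(INSTALLED) >= version.parse("0.10.0"):
    SUPPORTED.append("glm-4.5")
if version.parse(INSTALLED) > version.parse("0.10.0"):
    SUPPORTED.append("gpt-oss")
if version.parse(INSTALLED) > version.parse("0.10.1.1"):
    SUPPORTED.append("seed-oss")  # skipped on 0.10.1

print(SUPPORTED)  # ['glm-4.5', 'gpt-oss']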
@@ -557,7 +568,9 @@ class VLLMModel(LLM):
            raise err.with_traceback(tb)
 
        # set context length after engine inited
-        self._set_context_length()
+        # if shard > 0, the engine will be inited in another process
+        if self._engine:
+            self._set_context_length()
 
    def _set_context_length(self):
        from vllm import envs
@@ -839,7 +852,7 @@ class VLLMModel(LLM):
            return False
        if not cls._is_linux():
            return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
            return False
        if llm_spec.model_format == "pytorch":
            if quantization != "none" and not (quantization is None):
@@ -1187,7 +1200,14 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
    def match_json(
        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
    ) -> bool:
-        if llm_spec.model_format not in [
+        if llm_spec.model_format not in [
+            "pytorch",
+            "gptq",
+            "awq",
+            "fp8",
+            "bnb",
+            "ggufv2",
+        ]:
            return False
        if llm_spec.model_format == "pytorch":
            if quantization != "none" and not (quantization is None):
@@ -1284,6 +1304,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
        previous_texts = [""]
        tool_call = False
        tool_call_texts = [""]
+        full_text = ""
        if self.reasoning_parser:
            set_context()
            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
@@ -1299,6 +1320,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
            if not choices:
                yield self._get_final_chat_completion_chunk(chunk)
            else:
+                full_text += chunk["choices"][0]["text"]
                if self.is_tool_call_chunk_start(chunk):
                    tool_call = True
                if tool_call:
@@ -1320,6 +1342,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                        chunk, self.reasoning_parser, previous_texts
                    )
                i += 1
+        logger.debug("Chat finished, output: %s", full_text)
 
    @vllm_check
    async def async_chat(
@@ -1348,13 +1371,26 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
        ):
            full_context_kwargs["tools"] = tools
        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages, self.model_family.chat_template, **full_context_kwargs
-        )
 
        generate_config = self._sanitize_chat_config(generate_config)
        stream = generate_config.get("stream", None)
 
+        lora_request = None
+        lora_model = generate_config.get("lora_name")
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+        tokenizer = await self._get_tokenizer(lora_request)
+
+        full_prompt = self.get_full_context(
+            messages,
+            self.model_family.chat_template,
+            tokenizer=tokenizer,
+            **full_context_kwargs,
+        )
+
        if stream:
            agen = await self.async_generate(
                full_prompt, generate_config, tools, request_id=request_id
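
The hunk above resolves a LoRA adapter from generate_config["lora_name"] before building the prompt, so the tokenizer matches the adapter. A hedged sketch of how a caller might select that adapter (the uid, adapter name, and endpoint are placeholders):

from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-chat-model")  # placeholder uid
model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    generate_config={"lora_name": "my-adapter"},  # matched against lora_requests
)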
@@ -1386,7 +1422,7 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
            return False
        if not cls._is_linux():
            return False
-        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
            return False
        if llm_spec.model_format == "pytorch":
            if quantization != "none" and not (quantization is None):

xinference/model/rerank/__init__.py
CHANGED

@@ -16,10 +16,10 @@ import codecs
 import json
 import os
 import warnings
-from typing import Dict, List
+from typing import Any, Dict, List
 
 from ...constants import XINFERENCE_MODEL_DIR
-from ..utils import flatten_model_src
+from ..utils import flatten_quantizations
 from .core import (
    RERANK_MODEL_DESCRIPTIONS,
    RerankModelFamilyV2,
@@ -32,8 +32,13 @@ from .custom import (
    register_rerank,
    unregister_rerank,
 )
-
-BUILTIN_RERANK_MODELS
+from .rerank_family import (
+    BUILTIN_RERANK_MODELS,
+    RERANK_ENGINES,
+    SENTENCE_TRANSFORMER_CLASSES,
+    SUPPORTED_ENGINES,
+    VLLM_CLASSES,
+)
 
 
 def register_custom_model():
@@ -58,31 +63,69 @@ def register_custom_model():
            warnings.warn(f"{user_defined_rerank_dir}/{f} has error, {e}")
 
 
-def
-
+def generate_engine_config_by_model_name(model_family: "RerankModelFamilyV2"):
+    model_name = model_family.model_name
+    engines: Dict[str, List[Dict[str, Any]]] = RERANK_ENGINES.get(
+        model_name, {}
+    )  # structure for engine query
+    for spec in [x for x in model_family.model_specs if x.model_hub == "huggingface"]:
+        model_format = spec.model_format
+        quantization = spec.quantization
+        for engine in SUPPORTED_ENGINES:
+            CLASSES = SUPPORTED_ENGINES[engine]
+            for cls in CLASSES:
+                # Every engine needs to implement match method
+                if cls.match(model_family, spec, quantization):
+                    # we only match the first class for an engine
+                    if engine not in engines:
+                        engines[engine] = [
+                            {
+                                "model_name": model_name,
+                                "model_format": model_format,
+                                "quantization": quantization,
+                                "rerank_class": cls,
+                            }
+                        ]
+                    else:
+                        engines[engine].append(
+                            {
+                                "model_name": model_name,
+                                "model_format": model_format,
+                                "quantization": quantization,
+                                "rerank_class": cls,
+                            }
+                        )
+                    break
+    RERANK_ENGINES[model_name] = engines
+
 
-
-
+def _install():
+    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+    for json_obj in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")):
+        flattened = []
+        for spec in json_obj["model_specs"]:
+            flattened.extend(flatten_quantizations(spec))
+        json_obj["model_specs"] = flattened
+        BUILTIN_RERANK_MODELS[json_obj["model_name"]] = RerankModelFamilyV2(**json_obj)
+
+    for model_name, model_spec in BUILTIN_RERANK_MODELS.items():
        if model_spec.model_name not in RERANK_MODEL_DESCRIPTIONS:
            RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(model_spec))
 
-
+    from .sentence_transformers.core import SentenceTransformerRerankModel
+    from .vllm.core import VLLMRerankModel
 
-
-
-        RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))
+    SENTENCE_TRANSFORMER_CLASSES.extend([SentenceTransformerRerankModel])
+    VLLM_CLASSES.extend([VLLMRerankModel])
 
+    SUPPORTED_ENGINES["sentence_transformers"] = SENTENCE_TRANSFORMER_CLASSES
+    SUPPORTED_ENGINES["vllm"] = VLLM_CLASSES
 
-
-
-    flattened_model_specs = []
-    for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8")):
-        flattened_model_specs.extend(flatten_model_src(spec))
+    for model_spec in BUILTIN_RERANK_MODELS.values():
+        generate_engine_config_by_model_name(model_spec)
 
-
-        if spec["model_name"] not in target_families:
-            target_families[spec["model_name"]] = [RerankModelFamilyV2(**spec)]
-        else:
-            target_families[spec["model_name"]].append(RerankModelFamilyV2(**spec))
+    register_custom_model()
 
-
+    # register model description
+    for ud_rerank in get_user_defined_reranks():
+        RERANK_MODEL_DESCRIPTIONS.update(generate_rerank_description(ud_rerank))
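
After _install() runs, RERANK_ENGINES maps each model name to the per-engine entries that generate_engine_config_by_model_name collected. An illustrative, runnable sketch of the resulting shape for one model (the class below is a stand-in and the entries are invented):

class SentenceTransformerRerankModel:  # stand-in for the real engine class
    pass

RERANK_ENGINES = {
    "bge-reranker-v2-m3": {
        "sentence_transformers": [
            {
                "model_name": "bge-reranker-v2-m3",
                "model_format": "pytorch",
                "quantization": "none",
                "rerank_class": SentenceTransformerRerankModel,
            }
        ],
        # "vllm" appears here only when VLLMRerankModel.match(...) accepts a spec
    },
}

# Engine query: which engines can serve this model?
print(list(RERANK_ENGINES["bge-reranker-v2-m3"]))  # ['sentence_transformers']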

xinference/model/rerank/cache_manager.py
ADDED

@@ -0,0 +1,35 @@
+import os
+from typing import TYPE_CHECKING
+
+from ..cache_manager import CacheManager
+
+if TYPE_CHECKING:
+    from .core import RerankModelFamilyV2
+
+
+class RerankCacheManager(CacheManager):
+    def __init__(self, model_family: "RerankModelFamilyV2"):
+        from ..llm.cache_manager import LLMCacheManager
+
+        super().__init__(model_family)
+        # Composition design mode for avoiding duplicate code
+        self.cache_helper = LLMCacheManager(model_family)
+
+        spec = self._model_family.model_specs[0]
+        model_dir_name = (
+            f"{self._model_family.model_name}-{spec.model_format}-{spec.quantization}"
+        )
+        self._cache_dir = os.path.join(self._v2_cache_dir_prefix, model_dir_name)
+        self.cache_helper._cache_dir = self._cache_dir
+
+    def cache(self) -> str:
+        spec = self._model_family.model_specs[0]
+        if spec.model_uri is not None:
+            return self.cache_helper.cache_uri()
+        else:
+            if spec.model_hub == "huggingface":
+                return self.cache_helper.cache_from_huggingface()
+            elif spec.model_hub == "modelscope":
+                return self.cache_helper.cache_from_modelscope()
+            else:
+                raise ValueError(f"Unknown model hub: {spec.model_hub}")