xinference 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -538,7 +538,10 @@ def _generate_model_file_names(
     )
     need_merge = False

-    if llm_spec.quantization_parts is None:
+    if (
+        llm_spec.quantization_parts is None
+        or quantization not in llm_spec.quantization_parts
+    ):
         file_names.append(final_file_name)
     elif quantization is not None and quantization in llm_spec.quantization_parts:
         parts = llm_spec.quantization_parts[quantization]
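For orientation, a minimal sketch of how this branch resolves download file names; `resolve_file_names` and its arguments are simplified stand-ins for the real LLMSpecV1 fields, and the example values are hypothetical:

    # Sketch only, not the actual xinference implementation.
    def resolve_file_names(quantization_parts, quantization, final_file_name, split_template):
        file_names = []
        if quantization_parts is None or quantization not in quantization_parts:
            # No multi-part entry registered for this quantization: one file.
            file_names.append(final_file_name)
        elif quantization is not None and quantization in quantization_parts:
            # Multi-part quantization: one file name per registered part.
            for part in quantization_parts[quantization]:
                file_names.append(split_template.format(quantization=quantization, part=part))
        return file_names

    # Hypothetical values shaped like the GGUF entries later in this diff:
    resolve_file_names(
        {"q8_0": ["00001-of-00002", "00002-of-00002"]},
        "q4_k_m",
        "qwen2.5-7b-instruct-q4_k_m.gguf",
        "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
    )
    # -> ["qwen2.5-7b-instruct-q4_k_m.gguf"]  (falls back to the single file)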
@@ -4769,10 +4769,11 @@
         "model_format":"mlx",
         "model_size_in_billions":2,
         "quantizations":[
+          "4bit",
           "8bit"
         ],
         "model_hub": "modelscope",
-        "model_id":"okwinds/Qwen2-VL-2B-Instruct-MLX-8bit",
+        "model_id":"mlx-community/Qwen2-VL-2B-Instruct-{quantization}",
         "model_revision":"master"
       },
       {
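When a spec is declared this way, the `{quantization}` placeholder in `model_id` is filled with the quantization chosen at launch time before the repository is looked up on the hub. A hedged illustration using plain `str.format` (the real resolution goes through xinference's model-family machinery):

    # Illustration only.
    model_id_template = "mlx-community/Qwen2-VL-2B-Instruct-{quantization}"
    for q in ["4bit", "8bit"]:
        print(model_id_template.format(quantization=q))
    # mlx-community/Qwen2-VL-2B-Instruct-4bit
    # mlx-community/Qwen2-VL-2B-Instruct-8bit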
@@ -4825,6 +4826,97 @@
       "<|endoftext|>"
     ]
   },
+  {
+    "version":1,
+    "context_length":128000,
+    "model_name":"qwen2.5-vl-instruct",
+    "model_lang":[
+      "en",
+      "zh"
+    ],
+    "model_ability":[
+      "chat",
+      "vision"
+    ],
+    "model_description":"Qwen2.5-VL: Qwen2.5-VL is the latest version of the vision language models in the Qwen model familities.",
+    "model_specs":[
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-3B-Instruct"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-7B-Instruct"
+      },
+      {
+        "model_format":"pytorch",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"qwen/Qwen2.5-VL-72B-Instruct"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":3,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-3B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":7,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-7B-Instruct-{quantization}"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":72,
+        "quantizations":[
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/Qwen2.5-VL-72B-Instruct-{quantization}"
+      }
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 32768,
@@ -5558,7 +5650,7 @@
           "q8_0"
         ],
         "model_id": "qwen/Qwen2.5-7B-Instruct-GGUF",
-        "model_file_name_template": "qwen2_5-7b-instruct-{quantization}.gguf",
+        "model_file_name_template": "qwen2.5-7b-instruct-{quantization}.gguf",
         "model_hub": "modelscope",
         "model_file_name_split_template": "qwen2.5-7b-instruct-{quantization}-{part}.gguf",
         "quantization_parts": {
@@ -6473,6 +6565,19 @@
         "model_file_name_template": "DeepSeek-R1-Distill-Qwen-1.5B-{quantization}.gguf",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": "1_5",
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "mlx-community/DeepSeek-R1-Distill-Qwen-1.5B-{quantization}",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 7,
@@ -6621,6 +6726,125 @@
       "<|end▁of▁sentence|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "deepseek-r1-distill-llama",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "deepseek-r1-distill-llama is distilled from DeepSeek-R1 based on Llama",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "model_id": "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Llama-8B-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit",
+          "bf16"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Llama-8B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "Q2_K",
+          "Q2_K_L",
+          "Q3_K_M",
+          "Q4_K_M",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "F16"
+        ],
+        "quantization_parts": {
+          "Q6_K": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "Q8_0": [
+            "00001-of-00002",
+            "00002-of-00002"
+          ],
+          "F16": [
+            "00001-of-00003",
+            "00002-of-00003",
+            "00003-of-00003"
+          ]
+        },
+        "model_id": "unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF",
+        "model_file_name_template": "DeepSeek-R1-Distill-Qwen-7B-{quantization}.gguf",
+        "model_file_name_split_template": "DeepSeek-R1-Distill-Llama-70B-{quantization}/DeepSeek-R1-Distill-Llama-70B-{quantization}-{part}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "3bit",
+          "4bit",
+          "6bit",
+          "8bit"
+        ],
+        "model_id": "okwinds/DeepSeek-R1-Distill-Llama-70B-MLX-{quantization}",
+        "model_hub": "modelscope"
+      }
+    ],
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
+    "stop_token_ids": [
+      151643
+    ],
+    "stop": [
+      "<|end▁of▁sentence|>"
+    ]
+  },
   {
     "version": 1,
     "context_length": 8192,
@@ -6911,7 +7135,7 @@
       "<|endoftext|>"
     ]
   },
-   {
+  {
     "version": 1,
     "context_length": 32768,
     "model_name": "marco-o1",
@@ -7009,5 +7233,85 @@
       "<|user|>",
       "<|observation|>"
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 32768,
+    "model_name": "internlm3-instruct",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "tools"
+    ],
+    "model_description": "InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gptq-int4",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-awq",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm3-8b-instruct-gguf",
+        "model_file_name_template": "internlm3-8b-instruct-{quantization}.gguf",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format":"mlx",
+        "model_size_in_billions":8,
+        "quantizations":[
+          "4bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id":"mlx-community/internlm3-8b-instruct-{quantization}"
+      }
+    ],
+    "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "stop_token_ids": [
+      2,
+      128131
+    ],
+    "stop": [
+      "</s>",
+      "<|im_end|>"
+    ]
+  }
 ]
@@ -31,7 +31,12 @@ from ....types import (
 )
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin, generate_completion_chunk
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)

 logger = logging.getLogger(__name__)

@@ -424,8 +429,11 @@ class MLXChatModel(MLXModel, ChatModelMixin):
         model_family = self.model_family.model_family or self.model_family.model_name
         tools = generate_config.pop("tools", []) if generate_config else None
         full_context_kwargs = {}
-        if tools and model_family in QWEN_TOOL_CALL_FAMILY:
-            full_context_kwargs["tools"] = tools
+        if tools:
+            if model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+                self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
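Tool definitions now reach the prompt by two different routes depending on the model family. A rough sketch of the dispatch, where `fold_tools_into_messages` is a hypothetical stand-in for the `_tools_to_messages_for_deepseek` helper:

    # Sketch only; the family constants come from ..utils in the real code.
    def apply_tools(model_family, messages, tools, full_context_kwargs):
        if not tools:
            return
        if model_family in QWEN_TOOL_CALL_FAMILY:
            # Qwen chat templates accept the tool list as a template variable.
            full_context_kwargs["tools"] = tools
        elif model_family in DEEPSEEK_TOOL_CALL_FAMILY:
            # DeepSeek templates expect tool specs embedded in the conversation itself.
            fold_tools_into_messages(messages, tools)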
@@ -39,7 +39,12 @@ from ....types import (
 from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import LLAMA3_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY, ChatModelMixin
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    LLAMA3_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+)
 from .utils import get_context_length, get_max_src_len, pad_prefill_tokens

 logger = logging.getLogger(__name__)
@@ -62,6 +67,7 @@ NON_DEFAULT_MODEL_LIST: List[str] = [
     "MiniCPM-V-2.6",
     "glm-4v",
     "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
     "qwen2-audio",
     "qwen2-audio-instruct",
     "deepseek-v2",
@@ -681,6 +687,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             or model_family in LLAMA3_TOOL_CALL_FAMILY
         ):
             full_context_kwargs["tools"] = tools
+        elif tools and model_family in DEEPSEEK_TOOL_CALL_FAMILY:
+            self._tools_to_messages_for_deepseek(messages, tools)
         assert self.model_family.chat_template is not None
         full_prompt = self.get_full_context(
             messages,
@@ -55,9 +55,9 @@ class Qwen2AudioChatModel(PytorchChatModel):

         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
-        self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
+        self._device = device

         self._processor = AutoProcessor.from_pretrained(
             self.model_path,
@@ -105,6 +105,8 @@ class Qwen2AudioChatModel(PytorchChatModel):
         inputs = self._processor(
             text=text, audios=audios, return_tensors="pt", padding=True
         )
+        # Make sure that the inputs and the model are on the same device.
+        inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
         inputs.input_ids = inputs.input_ids.to(self._device)
         generate_config = generate_config if generate_config else {}
         stream = generate_config.get("stream", False) if generate_config else False
@@ -45,9 +45,13 @@ class Qwen2VLChatModel(PytorchChatModel):
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
             return True
+        if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+            return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
         return False
@@ -55,6 +59,11 @@ class Qwen2VLChatModel(PytorchChatModel):
     def load(self):
         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

+        try:
+            from transformers import Qwen2_5_VLForConditionalGeneration
+        except ImportError:
+            Qwen2_5_VLForConditionalGeneration = None
+
         device = self._pytorch_model_config.get("device", "auto")
         device = select_device(device)
         self._device = device
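The guarded import above is the usual pattern for classes that only exist in newer transformers releases: fall back to None at import time and raise a clear error only when the class is actually required. A condensed sketch of the same idea:

    # Sketch of the optional-import pattern; mirrors the structure of the hunks above and below.
    try:
        from transformers import Qwen2_5_VLForConditionalGeneration  # newer transformers only
    except ImportError:
        Qwen2_5_VLForConditionalGeneration = None

    def pick_model_cls(llm_family: str):
        from transformers import Qwen2VLForConditionalGeneration

        cls = (
            Qwen2_5_VLForConditionalGeneration
            if "qwen2.5" in llm_family
            else Qwen2VLForConditionalGeneration
        )
        if cls is None:
            raise ImportError("`transformers` version is too old, please upgrade it")
        return cls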
@@ -66,8 +75,16 @@ class Qwen2VLChatModel(PytorchChatModel):
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        llm_family = self.model_family.model_family or self.model_family.model_name
+        model_cls = (
+            Qwen2_5_VLForConditionalGeneration
+            if "qwen2.5" in llm_family
+            else Qwen2VLForConditionalGeneration
+        )
+        if model_cls is None:
+            raise ImportError("`transformers` version is too old, please upgrade it")
         if flash_attn_installed:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 torch_dtype="bfloat16",
                 device_map=device,
@@ -76,14 +93,14 @@ class Qwen2VLChatModel(PytorchChatModel):
             ).eval()
         elif is_npu_available():
             # Ascend do not support bf16
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype="float16",
             ).eval()
         else:
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self._model = model_cls.from_pretrained(
                 self.model_path, device_map=device, trust_remote_code=True
             ).eval()

@@ -193,16 +193,14 @@ def _get_pad_param(seq_len_idx: int, pad_len: int) -> Tuple:

 def _merge_kv_cache(
     xinf_model_obj: "PytorchModel",
-    past_kv: Tuple[Tuple[torch.Tensor]],
-    new_kv: Tuple[Tuple[torch.Tensor]],
-):
+    past_cache: DynamicCache,
+    new_cache: DynamicCache,
+) -> DynamicCache:
     from torch.nn.functional import pad

     _, seq_len_idx = xinf_model_obj.get_batch_size_and_seq_len_indexes_from_kv()
-    past_cache = DynamicCache.from_legacy_cache(past_kv)
-    new_cache = DynamicCache.from_legacy_cache(new_kv)
-    past_seq_len = past_kv[0][0].shape[seq_len_idx]
-    new_seq_len = new_kv[0][0].shape[seq_len_idx]
+    past_seq_len = past_cache[0][0].shape[seq_len_idx]
+    new_seq_len = new_cache[0][0].shape[seq_len_idx]
     if past_seq_len != new_seq_len:
         padding_target = new_cache if past_seq_len > new_seq_len else past_cache
         padding_len = abs(past_seq_len - new_seq_len)
@@ -219,8 +217,12 @@ def _merge_kv_cache(
     for idx in range(len(past_cache)):
         k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
         v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
-        ret_kv.update(torch.cat((k1, k2), 0), torch.cat((v1, v2), 0), idx)
-    return ret_kv.to_legacy_cache()
+        ret_kv.update(
+            torch.cat((k1, k2), 0).contiguous(),
+            torch.cat((v1, v2), 0).contiguous(),
+            idx,
+        )
+    return ret_kv


 def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel"):
@@ -228,6 +230,15 @@ def get_batch_size_and_seq_len_from_kv_cache(kv, xinf_model_obj: "PytorchModel")
     return kv[0][0].shape[bs_idx], kv[0][0].shape[seq_len_idx] + 1


+def convert_to_cache_cls(cache) -> DynamicCache:
+    """
+    Compatible with some old models
+    """
+    if isinstance(cache, tuple):
+        return DynamicCache.from_legacy_cache(cache)
+    return cache
+
+
 @torch.inference_mode()
 def _batch_inference_one_step_internal(
     xinf_model_obj: "PytorchModel",
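`DynamicCache` here is the container from `transformers.cache_utils`; older model classes still return the legacy tuple-of-tuples KV cache, which is what this shim normalizes before the batched-inference code indexes into `key_cache`/`value_cache`. A small, self-contained illustration of the two representations (tensor shapes are made up):

    # Illustration only; shapes are arbitrary (batch, heads, seq_len, head_dim).
    import torch
    from transformers.cache_utils import DynamicCache

    legacy = tuple(
        (torch.zeros(1, 2, 5, 8), torch.zeros(1, 2, 5, 8))  # (key, value) per layer
        for _ in range(2)
    )
    cache = DynamicCache.from_legacy_cache(legacy)
    assert cache.key_cache[0].shape == (1, 2, 5, 8)
    # convert_to_cache_cls(legacy) and convert_to_cache_cls(cache) both yield a DynamicCache.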
@@ -269,7 +280,7 @@ def _batch_inference_one_step_internal(
         out = model(**prefill_kws, use_cache=True)

         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)

         for i, r in enumerate(prefill_reqs):
             (
@@ -317,7 +328,7 @@ def _batch_inference_one_step_internal(
         )
         out = model(**inf_kws, use_cache=True, past_key_values=past_key_values)
         logits = out.logits
-        past_key_values = out.past_key_values
+        past_key_values = convert_to_cache_cls(out.past_key_values)

         for i, r in enumerate(valid_req_list):
             (