xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +473 -31
- xinference/client/restful/async_restful_client.py +178 -8
- xinference/client/restful/restful_client.py +151 -3
- xinference/core/supervisor.py +99 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +58 -21
- xinference/model/image/model_spec.json +159 -90
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1299 -174
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +44 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +48 -32
- xinference/model/llm/vllm/core.py +207 -72
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/minicpmv45.py ADDED
@@ -0,0 +1,340 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+from PIL import Image
+
+from .....core.model import register_batching_multimodal_models
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ....scheduler.request import InferenceRequest
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_batching_multimodal_models("MiniCPM-V-4.5")
+@register_transformer
+@register_non_default_model("MiniCPM-V-4.5")
+class MiniCPMV45Model(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "MiniCPM-V-4.5".lower() in family.lower():
+            return True
+        return False
+
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        # Configure pixel parameters for MiniCPM-V-4.5
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+        self._device = (
+            "auto"
+            if self._device == "cuda" and self.quantization is None
+            else self._device
+        )
+
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer
+
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig
+
+        if "int4" in self.model_path:
+            model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
+        else:
+            kwargs = self.apply_bnb_quantization()
+            model = AutoModel.from_pretrained(
+                self.model_path,
+                trust_remote_code=True,
+                torch_dtype=torch.float16,
+                device_map=self._device,
+                **kwargs,
+            )
+        self._model = model.eval()
+        # Specify hyperparameters for generation
+        self._model.generation_config = GenerationConfig.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+        )
+        self._device = self._model.device
+
+    def _message_content_to_chat(self, content):
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
+            text = " ".join(texts)
+            return text, images, frames
+        return content, [], []
+
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            video_existed = True
+            images_chat = video_frames
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        for h in chat_history or []:
+            images_history = []
+            role = h["role"]
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
+            if images_tmp != []:
+                images_history = images_tmp
+            if len(video_frames_h) > 0:
+                video_existed = True
+                images_history = video_frames_h
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        # Set decode params for video
+        params = {}
+        if video_existed:
+            params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
+        )
+
+        return chat_iter, -1
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to MiniCPM-V-4.5 documentation for generation parameters
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Handle input IDs and images for MiniCPM-V-4.5
+        Based on MiniCPM-V-2.6 implementation with adaptations for 4.5
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from ..utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
+        )
+        self.handle_batch_inference_results(req_list)
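For orientation (not part of the diff): a minimal sketch of driving the newly registered MiniCPM-V-4.5 model through the xinference client. The endpoint URL, engine choice, and generation parameters below are assumptions for illustration, not values taken from this release.

# Illustrative sketch only; assumes a running xinference supervisor at the given URL.
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed endpoint
model_uid = client.launch_model(
    model_name="MiniCPM-V-4.5",   # name registered by the new minicpmv45.py module
    model_type="LLM",
    model_engine="transformers",  # assumed; the new class is a Transformers backend
    model_format="pytorch",
)
model = client.get_model(model_uid)

# The new _message_content_to_chat helper accepts "text", "image_url" and
# "video_url" content parts, so an OpenAI-style multimodal message works here.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]
print(model.chat(messages=messages, generate_config={"max_tokens": 256}))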
xinference/model/llm/transformers/multimodal/qwen2_vl.py CHANGED
@@ -27,11 +27,19 @@ logger = logging.getLogger(__name__)
 
 
 @register_batching_multimodal_models(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 @register_transformer
 @register_non_default_model(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 class Qwen2VLChatModel(PytorchMultiModalModel):
     def _sanitize_model_config(
@@ -47,7 +55,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb", "fp8"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -56,6 +64,8 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
+        if "qwen3-vl" in llm_family.lower():
+            return True
         return False
 
     def decide_device(self):
@@ -85,13 +95,19 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         except ImportError:
             Qwen2_5_VLForConditionalGeneration = None
 
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = None
+
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        model_cls = (
-            Qwen2_5_VLForConditionalGeneration
-            if "qwen2.5" in llm_family
-            else Qwen2VLForConditionalGeneration
-        )
+        if "qwen2.5" in llm_family:
+            model_cls = Qwen2_5_VLForConditionalGeneration
+        elif "qwen3" in llm_family:
+            model_cls = AutoModelForImageTextToText
+        else:
+            model_cls = Qwen2VLForConditionalGeneration
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
@@ -118,6 +134,16 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
                 torch_dtype="float16",
                 **kwargs,
             ).eval()
+        elif device == "mps":
+            # MacOS special, see https://github.com/QwenLM/Qwen2.5-VL/issues/761
+            self._model = model_cls.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="eager",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).eval()
         else:
             self._model = model_cls.from_pretrained(
                 self.model_path,
xinference/model/llm/transformers/utils.py CHANGED
@@ -281,11 +281,34 @@ def _batch_inference_one_step_internal(
             r.append_new_token(token)
 
     if decode_reqs:
+        # Ensure all decode requests have the same kv_cache reference
+        # This prevents batch size mismatches during merging
         decode_kv = decode_reqs[0].kv_cache
+
+        # Verify that all decode requests share the same kv_cache
+        for req in decode_reqs[1:]:
+            if req.kv_cache is not decode_kv:
+                logger.warning(
+                    "Inconsistent kv_cache references detected in decode requests. "
+                    "This may indicate a batching synchronization issue."
+                )
+                # Use the first decode_kv as the reference to maintain consistency
+                req.kv_cache = decode_kv
+
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
         merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
+        # Update sequence length information after KV cache merge
+        _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
+            merged_kv_cache, xinf_model_obj
+        )
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
+            # Update attention mask sequence length to match merged KV cache
+            if "attention_mask_seq_len" in r.extra_kwargs:
+                # Ensure the attention mask length doesn't exceed the merged sequence length
+                r.extra_kwargs["attention_mask_seq_len"] = min(
+                    r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
+                )
         empty_cache()
     else:
         for r in valid_req_list:
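The clamp added above keeps each request's recorded attention-mask length consistent with the KV cache produced by the merge. As a toy illustration of that invariant (not xinference code):

# After merging prefill and decode caches, a request must not claim more attended
# positions than the merged cache actually holds; hypothetical values below.
attention_mask_seq_len = 310   # value carried over from before the merge
merged_seq_len = 300           # length of the merged KV cache
attention_mask_seq_len = min(attention_mask_seq_len, merged_seq_len - 1)
assert attention_mask_seq_len == 299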
xinference/model/llm/utils.py CHANGED
@@ -71,6 +71,12 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-Thinking",
     "Qwen3-Instruct",
     "Qwen3-Coder",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
+    "Qwen3-Next-Instruct",
+    "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -96,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -139,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -182,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -347,9 +351,7 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
-            and reasoning_parser
-            and reasoning_parser.check_content_parser()
+            if choices and choices[0]["finish_reason"] is not None or not choices
             else None
         )
         chat_chunk = {
@@ -798,7 +800,11 @@ class ChatModelMixin:
         chunk_id=None,
         previous_texts: List[str] = [""],
     ):
+        if not c.get("choices"):
+            return c
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
+        tool_result = None
+        finish_reason = None
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls_streaming(
                 [],
@@ -847,15 +853,11 @@ class ChatModelMixin:
                 "tool_calls": tool_calls,
             }
 
-        try:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
+            usage = None
+        else:
             usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
-            usage = {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -880,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-
-        c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-
-            reasoning_content,
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -911,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if
-                    failed_contents.append(
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
            m["reasoning_content"] = reasoning_content
 
-
-
-
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
@@ -1009,7 +1024,8 @@ class ChatModelMixin:
                 completion_chunk, self.reasoning_parser, previous_texts
             )
             if (
-                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                chat_chunk["choices"]
+                and "reasoning_content" in chat_chunk["choices"][0]["delta"]
                 and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
             ):
                 yield chat_chunk
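The utils.py changes above extend Qwen-style tool calling to the Qwen3-VL, Qwen3-Next, and Qwen3-Omni families and make streaming chunk handling tolerant of empty `choices`. As an illustration only, a tool-call request against an already-launched Qwen3-family model might look like the sketch below; the model UID, tool schema, and parameters are assumptions, not taken from the diff.

# Illustrative sketch; assumes a Qwen3-family chat model is already launched.
from xinference.client import Client

client = Client("http://localhost:9997")        # assumed endpoint
model = client.get_model("my-qwen3-model-uid")  # hypothetical model UID

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

resp = model.chat(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    generate_config={"max_tokens": 128},
)
# When the Qwen tool parser finds a <tool_call> block, finish_reason is set to
# "tool_calls" and the call appears under message.tool_calls rather than content.
print(resp["choices"][0])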