xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (132)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +54 -1
  4. xinference/client/restful/restful_client.py +82 -2
  5. xinference/constants.py +3 -0
  6. xinference/core/chat_interface.py +297 -83
  7. xinference/core/model.py +24 -3
  8. xinference/core/progress_tracker.py +16 -8
  9. xinference/core/supervisor.py +51 -1
  10. xinference/core/worker.py +315 -47
  11. xinference/deploy/cmdline.py +33 -1
  12. xinference/model/audio/core.py +11 -1
  13. xinference/model/audio/megatts.py +105 -0
  14. xinference/model/audio/model_spec.json +24 -1
  15. xinference/model/audio/model_spec_modelscope.json +26 -1
  16. xinference/model/core.py +14 -0
  17. xinference/model/embedding/core.py +6 -1
  18. xinference/model/flexible/core.py +6 -1
  19. xinference/model/image/core.py +6 -1
  20. xinference/model/image/model_spec.json +17 -1
  21. xinference/model/image/model_spec_modelscope.json +17 -1
  22. xinference/model/llm/__init__.py +4 -6
  23. xinference/model/llm/core.py +5 -0
  24. xinference/model/llm/llama_cpp/core.py +46 -17
  25. xinference/model/llm/llm_family.json +530 -85
  26. xinference/model/llm/llm_family.py +24 -1
  27. xinference/model/llm/llm_family_modelscope.json +572 -1
  28. xinference/model/llm/mlx/core.py +16 -2
  29. xinference/model/llm/reasoning_parser.py +3 -3
  30. xinference/model/llm/sglang/core.py +111 -13
  31. xinference/model/llm/transformers/__init__.py +14 -0
  32. xinference/model/llm/transformers/core.py +31 -6
  33. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  34. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  35. xinference/model/llm/transformers/gemma3.py +17 -2
  36. xinference/model/llm/transformers/intern_vl.py +28 -18
  37. xinference/model/llm/transformers/minicpmv26.py +21 -2
  38. xinference/model/llm/transformers/qwen-omni.py +308 -0
  39. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  40. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  41. xinference/model/llm/utils.py +37 -15
  42. xinference/model/llm/vllm/core.py +184 -8
  43. xinference/model/llm/vllm/distributed_executor.py +320 -0
  44. xinference/model/rerank/core.py +22 -12
  45. xinference/model/utils.py +118 -1
  46. xinference/model/video/core.py +6 -1
  47. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  48. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  49. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  50. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  51. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  52. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  53. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  54. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  55. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  56. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  57. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  58. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  59. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  60. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  61. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  62. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  63. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  64. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  65. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  66. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  67. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  68. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  69. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  70. xinference/thirdparty/megatts3/__init__.py +0 -0
  71. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  72. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  73. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  74. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  75. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  76. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  77. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  78. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  79. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  80. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  81. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  82. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  83. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  84. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  85. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  86. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  87. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  88. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  89. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  90. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  91. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  92. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  93. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  94. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  95. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  96. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  97. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  98. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  99. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  100. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  101. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  102. xinference/types.py +10 -0
  103. xinference/utils.py +54 -0
  104. xinference/web/ui/build/asset-manifest.json +6 -6
  105. xinference/web/ui/build/index.html +1 -1
  106. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  107. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  108. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  109. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  112. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  113. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  114. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  116. xinference/web/ui/src/locales/en.json +2 -1
  117. xinference/web/ui/src/locales/zh.json +2 -1
  118. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
  119. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
  120. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
  121. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  122. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  123. xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
  124. xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  129. /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  130. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
  131. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
  132. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/intern_vl.py

@@ -19,14 +19,14 @@ from typing import Dict, Iterator, List, Optional, Union
 import torch
 
 from ....types import ChatCompletion, ChatCompletionChunk
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import (
     _decode_image,
     generate_chat_completion,
     generate_completion_chunk,
     parse_messages,
 )
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
@@ -232,6 +232,10 @@ def _load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=
     return pixel_values, num_patches_list
 
 
+@register_transformer
+@register_non_default_model(
+    "internvl-chat", "internvl2", "Internvl2.5", "Internvl2.5-MPO", "InternVL3"
+)
 class InternVLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
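The two decorators are new in this release; their actual definitions live in llm_family.py and transformers/core.py, which are changed in this diff but not excerpted here. A rough sketch of the registration pattern they imply, with hypothetical bodies:

# Hypothetical sketch only -- the real implementations are in
# xinference/model/llm/llm_family.py and xinference/model/llm/transformers/core.py.
TRANSFORMER_CLASSES: list = []           # classes the transformers backend can serve
NON_DEFAULT_MODEL_FAMILIES: list = []    # families that must be matched explicitly

def register_transformer(cls):
    # Make the decorated model class discoverable by the transformers backend.
    TRANSFORMER_CLASSES.append(cls)
    return cls

def register_non_default_model(*model_names):
    # Mark the given family names as non-default so generic matching skips them.
    def wrapper(cls):
        NON_DEFAULT_MODEL_FAMILIES.extend(model_names)
        return cls
    return wrapper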
@@ -257,6 +261,8 @@ class InternVLChatModel(PytorchChatModel):
     def _split_model(self):
         import math
 
+        from transformers import AutoConfig
+
         device_map = {}
         world_size = torch.cuda.device_count()
         # single gpu
@@ -265,22 +271,26 @@ class InternVLChatModel(PytorchChatModel):
         model_size = f"{self.model_spec.model_size_in_billions}B"
         model_name = self.model_family.model_name.lower().replace("-mpo", "")
         model_name = f"{model_name}-{model_size}"
-        num_layers = {
-            "internvl2-1B": 24,
-            "internvl2-2B": 24,
-            "internvl2-4B": 32,
-            "internvl2-8B": 32,
-            "internvl2-26B": 48,
-            "internvl2-40B": 60,
-            "internvl2-76B": 80,
-            "internvl2.5-1B": 24,
-            "internvl2.5-2B": 24,
-            "internvl2.5-4B": 36,
-            "internvl2.5-8B": 32,
-            "internvl2.5-26B": 48,
-            "internvl2.5-38B": 64,
-            "internvl2.5-78B": 80,
-        }[model_name]
+        if "internvl3" in model_name.lower():
+            config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
+            num_layers = config.llm_config.num_hidden_layers
+        else:
+            num_layers = {
+                "internvl2-1B": 24,
+                "internvl2-2B": 24,
+                "internvl2-4B": 32,
+                "internvl2-8B": 32,
+                "internvl2-26B": 48,
+                "internvl2-40B": 60,
+                "internvl2-76B": 80,
+                "internvl2.5-1B": 24,
+                "internvl2.5-2B": 24,
+                "internvl2.5-4B": 36,
+                "internvl2.5-8B": 32,
+                "internvl2.5-26B": 48,
+                "internvl2.5-38B": 64,
+                "internvl2.5-78B": 80,
+            }[model_name]
 
         # Since the first GPU will be used for ViT, treat it as half a GPU.
         num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
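For InternVL3, the transformer depth is no longer taken from the static table above but read from the checkpoint's own config. A standalone sketch of that lookup (the helper name is illustrative; only the AutoConfig call and the llm_config.num_hidden_layers attribute come from the diff):

from transformers import AutoConfig

def internvl3_num_layers(model_path: str) -> int:
    # Mirrors the new branch above: InternVL3 checkpoints expose the LLM depth
    # under llm_config, so no hard-coded table entry is required.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    return config.llm_config.num_hidden_layers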
xinference/model/llm/transformers/minicpmv26.py

@@ -20,7 +20,12 @@ import torch
 from PIL import Image
 
 from ....core.scheduler import InferenceRequest
-from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    CompletionChunk,
+    PytorchModelConfig,
+)
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import (
@@ -52,6 +57,15 @@ class MiniCPMV26Model(PytorchChatModel):
             return True
         return False
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     def _get_model_class(self):
         from transformers import AutoModel
 
@@ -99,8 +113,13 @@ class MiniCPMV26Model(PytorchChatModel):
             self.model_path,
             trust_remote_code=True,
         )
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._device = self._model.device
         self._save_tensorizer()
xinference/model/llm/transformers/qwen-omni.py (new file)

@@ -0,0 +1,308 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import io
+import logging
+import sys
+import time
+import uuid
+from typing import Dict, Iterator, List, Optional, Union
+
+from ....model.utils import select_device
+from ....types import (
+    ChatCompletion,
+    ChatCompletionAudio,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChunk,
+    CompletionUsage,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+from ..utils import generate_completion_chunk
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
+from .utils import cache_clean
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
+    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
+)
+
+
+@register_transformer
+@register_non_default_model("qwen2.5-omni")
+class Qwen2_5OmniChatModel(PytorchChatModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._tokenizer = None
+        self._model = None
+        self._device = None
+        self._processor = None
+
+    @classmethod
+    def match(
+        cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+            return False
+        llm_family = model_family.model_family or model_family.model_name
+        if "qwen2.5-omni".lower() in llm_family.lower():
+            return True
+        return False
+
+    def load(self):
+        from transformers import (
+            Qwen2_5OmniForConditionalGeneration,
+            Qwen2_5OmniProcessor,
+        )
+
+        device = self._pytorch_model_config.get("device", "auto")
+        device = select_device(device)
+        self._device = device
+        # for multiple GPU, set back to auto to make multiple devices work
+        device = "auto" if device == "cuda" else device
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        kwargs = (
+            {}
+            if not flash_attn_installed
+            else {"attn_implementation": "flash_attention_2"}
+        )
+        logger.debug("Loading model with extra kwargs: %s", kwargs)
+
+        self._processor = Qwen2_5OmniProcessor.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+        self._tokenizer = self._processor.tokenizer
+        self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+            self.model_path,
+            torch_dtype="auto",
+            device_map=device,
+            trust_remote_code=True,
+            **kwargs,
+        )
+
+    @cache_clean
+    def chat(
+        self,
+        messages: List[Dict],
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        messages = self._transform_messages(messages)
+
+        generate_config = generate_config if generate_config else {}
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(messages, generate_config)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(messages, generate_config)
+            return c
+
+    def _transform_messages(
+        self,
+        messages: Union[List[ChatCompletionMessage], List[dict]],
+    ):
+        messages = super()._transform_messages(messages)
+        if messages[0]["role"] != "system":
+            messages.insert(
+                0,
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}],  # type: ignore
+                },
+            )
+        else:
+            logger.debug("Force to set system prompt")
+            messages[0]["content"] = [{"type": "text", "text": DEFAULT_SYSTEM_PROMPT}]  # type: ignore
+        return messages
+
+    def _generate(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> ChatCompletion:
+        import soundfile as sf
+        from qwen_omni_utils import process_mm_info
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        # Inference: Generation of the output
+        generated_ids, audio = self._model.generate(
+            **inputs,
+            speaker=voice,
+            max_new_tokens=config.get("max_tokens", 512),
+            temperature=config.get("temperature", 1),
+            use_audio_in_video=use_audio_in_video,
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self._processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+
+        wav_io = io.BytesIO()
+        sf.write(
+            wav_io,
+            audio.reshape(-1).detach().cpu().numpy(),
+            samplerate=24000,
+            format="WAV",
+        )
+        wav_bytes = wav_io.getvalue()
+        audio_content = base64.b64encode(wav_bytes).decode()
+
+        return ChatCompletion(
+            id="chat" + str(uuid.uuid1()),
+            object="chat.completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message={
+                        "role": "assistant",
+                        "content": output_text,
+                        "audio": ChatCompletionAudio(
+                            id="audio" + str(uuid.uuid1()),
+                            data=audio_content,
+                            expires_at=int(time.time()),
+                            transcript="",
+                        ),
+                    },
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+            ),
+        )
+
+    def _generate_stream(
+        self, messages: List, config: PytorchGenerateConfig = {}
+    ) -> Iterator[CompletionChunk]:
+        from threading import Thread
+
+        from qwen_omni_utils import process_mm_info
+        from transformers import TextIteratorStreamer
+
+        use_audio_in_video = config.get("use_audio_in_video", True)
+        voice = config.get("voice", "Chelsie")
+
+        text = self._processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        audios, images, videos = process_mm_info(
+            messages, use_audio_in_video=use_audio_in_video
+        )
+        logger.debug(
+            "Text, audio, image, video: %s, %s, %s, %s", text, audios, images, videos
+        )
+        inputs = self._processor(
+            text=text,
+            images=images,
+            audio=audios,
+            videos=videos,
+            padding=True,
+            return_tensors="pt",
+            use_audio_in_video=use_audio_in_video,
+        )
+        inputs = inputs.to(self._device)
+
+        tokenizer = self._tokenizer
+        streamer = TextIteratorStreamer(
+            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+        )
+
+        # TODO(xuye): Cannot find a way to streaming output,
+        # will implement it when it's supported
+
+        gen_kwargs = {
+            "max_new_tokens": config.get("max_tokens", 512),
+            "temperature": config.get("temperature", 1),
+            "streamer": streamer,
+            "speaker": voice,
+            **inputs,
+        }
+        error = None
+
+        def model_generate():
+            try:
+                return self._model.generate(**gen_kwargs)
+            except Exception:
+                nonlocal error
+                error = sys.exc_info()
+                streamer.end()
+                raise
+
+        thread = Thread(target=model_generate)
+        thread.start()
+
+        completion_id = str(uuid.uuid1())
+        for new_text in streamer:
+            yield generate_completion_chunk(
+                chunk_text=new_text,
+                finish_reason=None,
+                chunk_id=completion_id,
+                model_uid=self.model_uid,
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+                has_choice=True,
+                has_content=True,
+            )
+
+        if error:
+            _, err, tb = error  # type: ignore
+            raise err.with_traceback(tb)
+
+        yield generate_completion_chunk(
+            chunk_text=None,
+            finish_reason="stop",
+            chunk_id=completion_id,
+            model_uid=self.model_uid,
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+            has_choice=True,
+            has_content=False,
+        )
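For context, a minimal usage sketch of the new qwen2.5-omni model through the RESTful client. The endpoint, launch arguments, and message layout are assumptions based on the code above and the existing client API; the spoken reply arrives base64-encoded in the message's audio field, exactly as _generate() builds it.

import base64

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is an example
model_uid = client.launch_model(model_name="qwen2.5-omni", model_engine="transformers")
model = client.get_model(model_uid)

completion = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please answer out loud."},
                # audio_url parts are normalized by _transform_messages in utils.py
                {"type": "audio_url", "audio_url": {"url": "file:///tmp/question.wav"}},
            ],
        }
    ],
    generate_config={"voice": "Chelsie", "use_audio_in_video": True},
)

message = completion["choices"][0]["message"]
print(message["content"])  # text transcript of the reply
with open("reply.wav", "wb") as f:
    # 24 kHz WAV produced by sf.write() above, base64-encoded by _generate()
    f.write(base64.b64decode(message["audio"]["data"]))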
xinference/model/llm/transformers/qwen2_audio.py

@@ -74,7 +74,7 @@ class Qwen2AudioChatModel(PytorchChatModel):
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
    ):
        import librosa
 
xinference/model/llm/transformers/qwen2_vl.py

@@ -24,15 +24,18 @@ from ....types import (
     ChatCompletionChunk,
     ChatCompletionMessage,
     CompletionChunk,
+    PytorchModelConfig,
 )
-from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
 from ..utils import generate_chat_completion, generate_completion_chunk
-from .core import PytorchChatModel, PytorchGenerateConfig
+from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
 from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
 
+@register_transformer
+@register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
 class Qwen2VLChatModel(PytorchChatModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,6 +44,15 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = None
         self._processor = None
 
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
     @classmethod
     def match(
         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
@@ -69,9 +81,13 @@ class Qwen2VLChatModel(PytorchChatModel):
         self._device = device
         # for multiple GPU, set back to auto to make multiple devices work
         device = "auto" if device == "cuda" else device
-
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
         self._processor = AutoProcessor.from_pretrained(
-            self.model_path, trust_remote_code=True
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         self._tokenizer = self._processor.tokenizer
         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
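Both Qwen2-VL and MiniCPM-V-2.6 now default min_pixels/max_pixels in _sanitize_model_config and forward them to AutoProcessor, so the visual token budget can be tuned per launch. A sketch of overriding the defaults from the client, assuming extra launch kwargs are merged into the PyTorch model config (values are illustrative):

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is an example
model_uid = client.launch_model(
    model_name="qwen2.5-vl-instruct",
    model_engine="transformers",
    # Defaults set above are 256 * 28 * 28 and 1280 * 28 * 28; lowering
    # max_pixels trades visual detail for fewer image tokens.
    min_pixels=256 * 28 * 28,
    max_pixels=640 * 28 * 28,
)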
xinference/model/llm/utils.py

@@ -31,6 +31,7 @@ from typing import (
     List,
     Optional,
     Tuple,
+    Union,
     cast,
 )
 
@@ -255,19 +256,26 @@ class ChatModelMixin:
             and choices
             and "delta" in choices[0]
         ):
-            if reasoning_parser is not None:
-                # process parsing reasoning content
-                assert previous_texts is not None
+            if choices[0]["finish_reason"] is None:
+                if reasoning_parser is not None:
+                    # process parsing reasoning content
+                    assert previous_texts is not None
+                    delta = choices[0]["delta"]  # type: ignore
+                    if text := delta.get("content"):
+                        current_text = previous_texts[-1] + text
+                        delta = reasoning_parser.extract_reasoning_content_streaming(
+                            previous_text=previous_texts[-1],
+                            current_text=current_text,
+                            delta_text=text,
+                        )
+                        previous_texts[-1] = current_text
+                        choices[0]["delta"] = delta  # type: ignore
+            elif choices[0]["finish_reason"] is not None:
                 delta = choices[0]["delta"]  # type: ignore
-                if text := delta.get("content"):
-                    current_text = previous_texts[-1] + text
-                    delta = reasoning_parser.extract_reasoning_content_streaming(
-                        previous_text=previous_texts[-1],
-                        current_text=current_text,
-                        delta_text=text,
-                    )
-                    previous_texts[-1] = current_text
-                    choices[0]["delta"] = delta  # type: ignore
+                if "content" not in delta:
+                    delta["content"] = ""  # type: ignore
+                if reasoning_parser is not None:
+                    delta["reasoning_content"] = None  # type: ignore
             # Already a ChatCompletionChunk, we don't need to convert chunk.
             return cast(ChatCompletionChunk, chunk)
 
@@ -286,7 +294,11 @@ class ChatModelMixin:
                         delta_text=choice["text"],
                     )
                     previous_texts[-1] = current_text
-            if "tool_calls" in choice:
+            elif "text" in choice and choice["finish_reason"] is not None:
+                delta["content"] = choice["text"]
+                if reasoning_parser is not None:
+                    delta["reasoning_content"] = None
+            elif "tool_calls" in choice:
                 delta["tool_calls"] = choice["tool_calls"]
             choices_list.append(
                 {
@@ -319,8 +331,9 @@ class ChatModelMixin:
     ) -> ChatCompletionChunk:
         choices_list = []
         for i, choice in enumerate(chunk["choices"]):
-            delta = {"role": "assistant", "content": ""}
+            delta = ChatCompletionChunkDelta(role="assistant", content="")
             if reasoning_parser is not None:
+                delta["content"] = None
                 delta["reasoning_content"] = ""
             choices_list.append(
                 {
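The previous_texts[-1] bookkeeping in the hunks above exists so the reasoning parser can tell, for every streamed delta, whether the thinking block has already closed. A self-contained toy illustration of that idea (this is not the ReasoningParser API from reasoning_parser.py, just the splitting principle):

def split_think_stream(deltas):
    # Toy version of the previous/current text bookkeeping: text up to and
    # including "</think>" is routed to reasoning_content, the rest to content.
    previous = ""
    closed = False
    for delta_text in deltas:
        current = previous + delta_text
        if not closed and "</think>" in current:
            end = current.index("</think>") + len("</think>")
            yield {"reasoning_content": current[len(previous):end], "content": None}
            if end < len(current):
                yield {"reasoning_content": None, "content": current[end:]}
            closed = True
        elif not closed:
            yield {"reasoning_content": delta_text, "content": None}
        else:
            yield {"reasoning_content": None, "content": delta_text}
        previous = current

# list(split_think_stream(["<think>plan", "</think>", "Hi"])) yields the two
# reasoning pieces first, then {"reasoning_content": None, "content": "Hi"}.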
@@ -750,7 +763,7 @@ class ChatModelMixin:
 
     def _transform_messages(
         self,
-        messages: List[ChatCompletionMessage],
+        messages: Union[List[ChatCompletionMessage], List[dict]],
     ):
         transformed_messages = []
         for msg in messages:
@@ -771,6 +784,15 @@ class ChatModelMixin:
                     new_content.append(
                         {"type": "video", "video": item["video_url"]["url"]}
                     )
+                elif "audio_url" in item:
+                    new_content.append(
+                        {"type": "audio", "audio": item["audio_url"]["url"]}
+                    )
+                else:
+                    logger.warning(
+                        "Unknown message type, message: %s, this message may be ignored",
+                        messages,
+                    )
             new_message = {"role": role, "content": new_content}
             transformed_messages.append(new_message)
 