xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (124)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +79 -2
  3. xinference/client/restful/restful_client.py +65 -3
  4. xinference/conftest.py +0 -7
  5. xinference/core/media_interface.py +132 -8
  6. xinference/core/model.py +44 -6
  7. xinference/core/scheduler.py +1 -10
  8. xinference/core/supervisor.py +8 -17
  9. xinference/core/worker.py +5 -27
  10. xinference/deploy/cmdline.py +6 -2
  11. xinference/model/audio/chattts.py +24 -39
  12. xinference/model/audio/cosyvoice.py +18 -30
  13. xinference/model/audio/funasr.py +42 -0
  14. xinference/model/audio/model_spec.json +71 -1
  15. xinference/model/audio/model_spec_modelscope.json +76 -2
  16. xinference/model/audio/utils.py +75 -0
  17. xinference/model/core.py +1 -0
  18. xinference/model/embedding/__init__.py +74 -18
  19. xinference/model/embedding/core.py +98 -589
  20. xinference/model/embedding/embed_family.py +133 -0
  21. xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
  22. xinference/model/embedding/flag/core.py +282 -0
  23. xinference/model/embedding/model_spec.json +24 -0
  24. xinference/model/embedding/model_spec_modelscope.json +24 -0
  25. xinference/model/embedding/sentence_transformers/__init__.py +13 -0
  26. xinference/model/embedding/sentence_transformers/core.py +399 -0
  27. xinference/model/embedding/vllm/core.py +95 -0
  28. xinference/model/image/model_spec.json +30 -3
  29. xinference/model/image/model_spec_modelscope.json +41 -2
  30. xinference/model/image/stable_diffusion/core.py +144 -53
  31. xinference/model/llm/__init__.py +6 -54
  32. xinference/model/llm/core.py +19 -5
  33. xinference/model/llm/llama_cpp/core.py +59 -3
  34. xinference/model/llm/llama_cpp/memory.py +457 -0
  35. xinference/model/llm/llm_family.json +247 -402
  36. xinference/model/llm/llm_family.py +88 -16
  37. xinference/model/llm/llm_family_modelscope.json +260 -421
  38. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  39. xinference/model/llm/sglang/core.py +8 -0
  40. xinference/model/llm/transformers/__init__.py +27 -6
  41. xinference/model/llm/transformers/chatglm.py +4 -2
  42. xinference/model/llm/transformers/core.py +49 -28
  43. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  44. xinference/model/llm/transformers/gemma3.py +119 -164
  45. xinference/model/llm/transformers/multimodal/__init__.py +13 -0
  46. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  47. xinference/model/llm/transformers/multimodal/core.py +205 -0
  48. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  49. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  50. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  51. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  52. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  53. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  54. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  55. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  56. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  57. xinference/model/llm/transformers/opt.py +4 -2
  58. xinference/model/llm/transformers/utils.py +6 -37
  59. xinference/model/llm/utils.py +11 -0
  60. xinference/model/llm/vllm/core.py +7 -0
  61. xinference/model/rerank/core.py +91 -3
  62. xinference/model/rerank/model_spec.json +24 -0
  63. xinference/model/rerank/model_spec_modelscope.json +24 -0
  64. xinference/model/rerank/utils.py +20 -2
  65. xinference/model/utils.py +38 -1
  66. xinference/model/video/diffusers.py +65 -3
  67. xinference/model/video/model_spec.json +31 -4
  68. xinference/model/video/model_spec_modelscope.json +32 -4
  69. xinference/web/ui/build/asset-manifest.json +6 -6
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/css/main.013f296b.css +2 -0
  72. xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
  73. xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
  74. xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
  82. xinference/web/ui/src/locales/en.json +21 -8
  83. xinference/web/ui/src/locales/ja.json +224 -0
  84. xinference/web/ui/src/locales/ko.json +224 -0
  85. xinference/web/ui/src/locales/zh.json +21 -8
  86. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
  87. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
  88. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
  89. xinference/model/llm/transformers/cogvlm2.py +0 -442
  90. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  91. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  92. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  93. xinference/model/llm/transformers/intern_vl.py +0 -526
  94. xinference/model/llm/transformers/internlm2.py +0 -94
  95. xinference/model/llm/transformers/minicpmv25.py +0 -193
  96. xinference/model/llm/transformers/omnilmm.py +0 -132
  97. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  98. xinference/model/llm/transformers/qwen_vl.py +0 -360
  99. xinference/thirdparty/omnilmm/LICENSE +0 -201
  100. xinference/thirdparty/omnilmm/chat.py +0 -218
  101. xinference/thirdparty/omnilmm/constants.py +0 -4
  102. xinference/thirdparty/omnilmm/conversation.py +0 -332
  103. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  104. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  105. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  106. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  107. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  108. xinference/thirdparty/omnilmm/utils.py +0 -134
  109. xinference/web/ui/build/static/css/main.337afe76.css +0 -2
  110. xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
  111. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  112. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  117. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  118. xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
  120. /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
  121. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
  122. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
  123. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
  124. {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import logging
+ from io import BytesIO
+ from threading import Thread
+ from typing import Any, Dict, Iterator, List, Tuple
+ from urllib.request import urlopen
+
+ import numpy as np
+
+ from .....model.utils import select_device
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_transformer
+ @register_non_default_model("qwen2-audio-instruct")
+ class Qwen2AudioChatModel(PytorchMultiModalModel):
+     @classmethod
+     def match_json(
+         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         llm_family = model_family.model_family or model_family.model_name
+         if "qwen2-audio".lower() in llm_family.lower():
+             return True
+         return False
+
+     def decide_device(self):
+         device = self._pytorch_model_config.get("device", "auto")
+         self._device = select_device(device)
+
+     def load_processor(self):
+         from transformers import AutoProcessor
+
+         self._processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             device_map="auto" if self._device == "cuda" else self._device,
+             # trust_remote_code=True,
+             code_revision=self.model_spec.model_revision,
+         )
+
+         self._tokenizer = self._processor.tokenizer
+
+     def load_multimodal_model(self):
+         from transformers import Qwen2AudioForConditionalGeneration
+
+         kwargs = self.apply_bnb_quantization()
+         self._model = Qwen2AudioForConditionalGeneration.from_pretrained(
+             self.model_path,
+             device_map="auto" if self._device == "cuda" else self._device,
+             # trust_remote_code=True,
+             revision=self.model_spec.model_revision,
+             **kwargs,
+         )
+
+     def _transform_messages(
+         self,
+         messages: List[dict],  # type: ignore
+     ):
+         import librosa
+
+         text = self._processor.apply_chat_template(
+             messages, add_generation_prompt=True, tokenize=False
+         )
+         audios: List[np.ndarray] = []
+         for msg in messages:
+             content = msg["content"]
+             if isinstance(content, List):
+                 for item in content:  # type: ignore
+                     if item.get("type") == "audio" and "audio_url" in item:
+                         audio = librosa.load(
+                             BytesIO(urlopen(item["audio_url"]["url"]).read()),
+                             sr=self._processor.feature_extractor.sampling_rate,
+                         )[0]
+                         audios.append(audio)
+
+         return text, audios
+
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         text, audios = self._transform_messages(messages)
+         inputs = self._processor(
+             text=text, audios=audios, return_tensors="pt", padding=True
+         )
+         # Make sure that the inputs and the model are on the same device.
+         inputs.data = {k: v.to(self._device) for k, v in inputs.data.items()}
+         inputs.input_ids = inputs.input_ids.to(self._device)
+         return inputs
+
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         return dict(max_length=generate_config.get("max_tokens", 512))
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         from transformers import TextIteratorStreamer
+
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         config = self.build_generate_kwargs(generate_config)
+
+         tokenizer = self._processor.tokenizer
+         streamer = TextIteratorStreamer(
+             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+         )
+
+         gen_kwargs = {"streamer": streamer, **inputs, **config}
+         thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+         thread.start()
+         return streamer, len(inputs.input_ids[0])
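For context, here is a minimal sketch of how the new qwen2-audio-instruct registration above might be exercised through the xinference client. The endpoint, model_engine, audio URL, and generate_config values are illustrative assumptions, not part of this diff; the "audio" content item simply mirrors the shape read by _transform_messages above.

# Hedged usage sketch for the new Qwen2AudioChatModel (all concrete values are assumptions).
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed local xinference endpoint
model_uid = client.launch_model(
    model_name="qwen2-audio-instruct",
    model_type="LLM",
    model_engine="transformers",  # assumed engine selection
    model_format="pytorch",
)
model = client.get_model(model_uid)

# The audio content part matches what _transform_messages expects:
# {"type": "audio", "audio_url": {"url": ...}}.
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio_url": {"url": "https://example.com/sample.wav"}},
                {"type": "text", "text": "What is being said in this clip?"},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])

Note that build_generate_kwargs above maps the max_tokens value onto max_length for generation.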
@@ -1,256 +1,224 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import importlib.util
- import logging
- import sys
- import uuid
- from typing import Iterator, List, Optional, Union
-
- from ....device_utils import is_npu_available
- from ....model.utils import select_device
- from ....types import (
-     ChatCompletion,
-     ChatCompletionChunk,
-     ChatCompletionMessage,
-     CompletionChunk,
-     PytorchModelConfig,
- )
- from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
- from ..utils import generate_chat_completion, generate_completion_chunk
- from .core import PytorchChatModel, PytorchGenerateConfig, register_non_default_model
- from .utils import cache_clean
-
- logger = logging.getLogger(__name__)
-
-
- @register_transformer
- @register_non_default_model("qwen2-vl-instruct", "qwen2.5-vl-instruct")
- class Qwen2VLChatModel(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._tokenizer = None
-         self._model = None
-         self._device = None
-         self._processor = None
-
-     def _sanitize_model_config(
-         self, pytorch_model_config: Optional[PytorchModelConfig]
-     ) -> PytorchModelConfig:
-         pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
-         assert pytorch_model_config is not None
-         pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
-         pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
-         return pytorch_model_config
-
-     @classmethod
-     def match_json(
-         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
-     ) -> bool:
-         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
-             return False
-         llm_family = model_family.model_family or model_family.model_name
-         if "qwen2-vl-instruct".lower() in llm_family.lower():
-             return True
-         if "qwen2.5-vl-instruct".lower() in llm_family.lower():
-             return True
-         if "qvq-72b-preview".lower() in llm_family.lower():
-             return True
-         return False
-
-     def load(self):
-         from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-
-         try:
-             from transformers import Qwen2_5_VLForConditionalGeneration
-         except ImportError:
-             Qwen2_5_VLForConditionalGeneration = None
-
-         device = self._pytorch_model_config.get("device", "auto")
-         device = select_device(device)
-         self._device = device
-         # for multiple GPU, set back to auto to make multiple devices work
-         device = "auto" if device == "cuda" else device
-         kwargs = self.apply_bnb_quantization()
-
-         min_pixels = self._pytorch_model_config.get("min_pixels")
-         max_pixels = self._pytorch_model_config.get("max_pixels")
-         self._processor = AutoProcessor.from_pretrained(
-             self.model_path,
-             trust_remote_code=True,
-             min_pixels=min_pixels,
-             max_pixels=max_pixels,
-         )
-         self._tokenizer = self._processor.tokenizer
-         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
-         llm_family = self.model_family.model_family or self.model_family.model_name
-         model_cls = (
-             Qwen2_5_VLForConditionalGeneration
-             if "qwen2.5" in llm_family
-             else Qwen2VLForConditionalGeneration
-         )
-         if model_cls is None:
-             raise ImportError("`transformers` version is too old, please upgrade it")
-         if flash_attn_installed:
-             self._model = model_cls.from_pretrained(
-                 self.model_path,
-                 torch_dtype="bfloat16",
-                 device_map=device,
-                 attn_implementation="flash_attention_2",
-                 trust_remote_code=True,
-                 **kwargs,
-             ).eval()
-         elif is_npu_available():
-             # Ascend do not support bf16
-             self._model = model_cls.from_pretrained(
-                 self.model_path,
-                 device_map="auto",
-                 trust_remote_code=True,
-                 torch_dtype="float16",
-                 **kwargs,
-             ).eval()
-         else:
-             self._model = model_cls.from_pretrained(
-                 self.model_path, device_map=device, trust_remote_code=True
-             ).eval()
-
-     @cache_clean
-     def chat(
-         self,
-         messages: List[ChatCompletionMessage],  # type: ignore
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         messages = self._transform_messages(messages)
-
-         generate_config = generate_config if generate_config else {}
-
-         stream = generate_config.get("stream", False) if generate_config else False
-
-         if stream:
-             it = self._generate_stream(messages, generate_config)
-             return self._to_chat_completion_chunks(it)
-         else:
-             c = self._generate(messages, generate_config)
-             return c
-
-     def _generate(
-         self, messages: List, config: PytorchGenerateConfig = {}
-     ) -> ChatCompletion:
-         from qwen_vl_utils import process_vision_info
-
-         # Preparation for inference
-         text = self._processor.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages)
-         inputs = self._processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt",
-         )
-         inputs = inputs.to(self._device)
-
-         # Inference: Generation of the output
-         generated_ids = self._model.generate(
-             **inputs,
-             max_new_tokens=config.get("max_tokens", 512),
-             temperature=config.get("temperature", 1),
-         )
-         generated_ids_trimmed = [
-             out_ids[len(in_ids) :]
-             for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-         ]
-         output_text = self._processor.batch_decode(
-             generated_ids_trimmed,
-             skip_special_tokens=True,
-             clean_up_tokenization_spaces=False,
-         )[0]
-         return generate_chat_completion(self.model_uid, output_text)
-
-     def _generate_stream(
-         self, messages: List, config: PytorchGenerateConfig = {}
-     ) -> Iterator[CompletionChunk]:
-         from threading import Thread
-
-         from qwen_vl_utils import process_vision_info
-         from transformers import TextIteratorStreamer
-
-         text = self._processor.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages)
-         inputs = self._processor(
-             text=[text],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt",
-         )
-         inputs = inputs.to(self._model.device)
-
-         tokenizer = self._tokenizer
-         streamer = TextIteratorStreamer(
-             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
-         )
-
-         gen_kwargs = {
-             "max_new_tokens": config.get("max_tokens", 512),
-             "temperature": config.get("temperature", 1),
-             "streamer": streamer,
-             **inputs,
-         }
-         error = None
-
-         def model_generate():
-             try:
-                 return self._model.generate(**gen_kwargs)
-             except Exception:
-                 nonlocal error
-                 error = sys.exc_info()
-                 streamer.end()
-                 raise
-
-         thread = Thread(target=model_generate)
-         thread.start()
-
-         completion_id = str(uuid.uuid1())
-         for new_text in streamer:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-                 has_choice=True,
-                 has_content=True,
-             )
-
-         if error:
-             _, err, tb = error  # type: ignore
-             raise err.with_traceback(tb)
-
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
-         )
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import importlib.util
+ import logging
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+ from .....core.model import register_batching_multimodal_models
+ from .....core.scheduler import InferenceRequest
+ from .....device_utils import is_npu_available
+ from .....model.utils import select_device
+ from .....types import PytorchModelConfig
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_batching_multimodal_models(
+     "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+ )
+ @register_transformer
+ @register_non_default_model(
+     "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+ )
+ class Qwen2VLChatModel(PytorchMultiModalModel):
+     def _sanitize_model_config(
+         self, pytorch_model_config: Optional[PytorchModelConfig]
+     ) -> PytorchModelConfig:
+         pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+         assert pytorch_model_config is not None
+         pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+         pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+         return pytorch_model_config
+
+     @classmethod
+     def match_json(
+         cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str
+     ) -> bool:
+         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
+             return False
+         llm_family = model_family.model_family or model_family.model_name
+         if "qwen2-vl-instruct".lower() in llm_family.lower():
+             return True
+         if "qwen2.5-vl-instruct".lower() in llm_family.lower():
+             return True
+         if "qvq-72b-preview".lower() in llm_family.lower():
+             return True
+         return False
+
+     def decide_device(self):
+         device = self._pytorch_model_config.get("device", "auto")
+         device = select_device(device)
+         # for multiple GPU, set back to auto to make multiple devices work
+         self._device = device
+
+     def load_processor(self):
+         from transformers import AutoProcessor
+
+         min_pixels = self._pytorch_model_config.get("min_pixels")
+         max_pixels = self._pytorch_model_config.get("max_pixels")
+         self._processor = AutoProcessor.from_pretrained(
+             self.model_path,
+             trust_remote_code=True,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+         self._tokenizer = self._processor.tokenizer
+
+     def load_multimodal_model(self):
+         from transformers import Qwen2VLForConditionalGeneration
+
+         try:
+             from transformers import Qwen2_5_VLForConditionalGeneration
+         except ImportError:
+             Qwen2_5_VLForConditionalGeneration = None
+
+         kwargs = self.apply_bnb_quantization()
+         flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+         llm_family = self.model_family.model_family or self.model_family.model_name
+         model_cls = (
+             Qwen2_5_VLForConditionalGeneration
+             if "qwen2.5" in llm_family
+             else Qwen2VLForConditionalGeneration
+         )
+         if model_cls is None:
+             raise ImportError("`transformers` version is too old, please upgrade it")
+         device = "auto" if self._device == "cuda" else self._device
+         if flash_attn_installed:
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 torch_dtype="bfloat16",
+                 device_map=device,
+                 attn_implementation="flash_attention_2",
+                 trust_remote_code=True,
+                 **kwargs,
+             ).eval()
+         elif is_npu_available():
+             # Ascend do not support bf16
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 device_map="auto",
+                 trust_remote_code=True,
+                 torch_dtype="float16",
+                 **kwargs,
+             ).eval()
+         else:
+             self._model = model_cls.from_pretrained(
+                 self.model_path,
+                 device_map=device,
+                 trust_remote_code=True,
+                 **kwargs,
+             ).eval()
+
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         from qwen_vl_utils import process_vision_info
+
+         messages = self._transform_messages(messages)
+         # Preparation for inference
+         text = self._processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = self._processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(self._device)
+         return inputs
+
+     def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
+         max_new_tokens = generate_config.get("max_tokens", 512)
+         temperature = generate_config.get("temperature", 1)
+         return {"max_new_tokens": max_new_tokens, "temperature": temperature}
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         from threading import Thread
+
+         from transformers import TextIteratorStreamer
+
+         tokenizer = self._tokenizer
+         streamer = TextIteratorStreamer(
+             tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+         )
+
+         inputs = self.build_inputs_from_messages(messages, generate_config)
+         config = self.build_generate_kwargs(generate_config)
+
+         def model_generate():
+             try:
+                 return self._model.generate(**inputs, **config, streamer=streamer)
+             except Exception:
+                 streamer.end()
+                 raise
+
+         thread = Thread(target=model_generate)
+         thread.start()
+         return streamer, len(inputs.input_ids[0])
+
+     def prepare_sanitize_generate_config(self, req: InferenceRequest):
+         """
+         This file corresponds to multiple models,
+         so the corresponding configuration is read directly through the transformers interface.
+         """
+         from transformers import GenerationConfig
+
+         gen_config = GenerationConfig.from_pretrained(self.model_path).to_dict()
+         raw_config = req.inference_kwargs.get("raw_params", {})
+         gen_config.update(raw_config)
+         return gen_config
+
+     def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
+         return self._transform_messages(messages)
+
+     def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+         import torch
+         from qwen_vl_utils import process_vision_info
+
+         batch_text = self._processor.apply_chat_template(
+             prompts, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(prompts)
+         inputs = self._processor(
+             text=batch_text,
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             padding_side="left",
+             return_tensors="pt",
+         )
+         inputs = inputs.to(self._model.device)
+         for r, _ids, attn_mask in zip(
+             req_list, inputs["input_ids"], inputs["attention_mask"]
+         ):
+             r.prompt_tokens = _ids.tolist()
+             real_len = torch.sum(attn_mask).item()
+             r.padding_len = attn_mask.numel() - real_len
+             r.extra_kwargs["attention_mask_seq_len"] = real_len
+         input_ids = inputs["input_ids"]
+         batch_size, seq_len = input_ids.shape
+         position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list)
+         return {**inputs, "position_ids": position_ids}
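The rewrite above drops the hand-rolled load()/chat()/_generate_stream() in favor of hook methods consumed by the new PytorchMultiModalModel base class (multimodal/core.py). Below is a rough sketch of how such a template base class could orchestrate those hooks; the hook names follow the overrides above, while the orchestration itself is an illustrative assumption rather than the actual core.py implementation.

# Illustrative template-method sketch only; not the real multimodal/core.py.
from typing import Any, Dict, Iterator, List, Tuple


class MultiModalTemplateSketch:
    """Shows the hook-based flow that subclasses such as Qwen2VLChatModel fill in."""

    def load(self) -> None:
        # Each step is supplied by the concrete model class.
        self.decide_device()
        self.load_processor()
        self.load_multimodal_model()

    def stream_chat(self, messages: List[Dict], generate_config: Dict) -> Iterator[str]:
        # The subclass returns a text streamer plus the prompt length;
        # the base class only has to drain the streamer.
        streamer, _prompt_len = self.build_streaming_iter(messages, generate_config)
        for new_text in streamer:
            yield new_text

    # Hooks overridden by subclasses:
    def decide_device(self) -> None: ...
    def load_processor(self) -> None: ...
    def load_multimodal_model(self) -> None: ...
    def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict) -> Any: ...
    def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]: ...
    def build_streaming_iter(self, messages: List[Dict], generate_config: Dict) -> Tuple[Iterator, int]: ...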
@@ -16,10 +16,12 @@ from typing import List, Optional
 
  from ....core.scheduler import InferenceRequest
  from ....types import LoRA
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from .core import PytorchModel, PytorchModelConfig
+ from ..llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from .core import PytorchModel, PytorchModelConfig, register_non_default_model
 
 
+ @register_transformer
+ @register_non_default_model("opt")
  class OptPytorchModel(PytorchModel):
      def __init__(
          self,
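For orientation, decorators like register_transformer and register_non_default_model typically just record the decorated class in a name-keyed registry so it is matched only for the listed model names rather than serving as the generic fallback. A minimal sketch of that pattern follows; the names and behavior here are assumptions, not the actual xinference implementation.

# Minimal sketch of a name-keyed registry decorator; illustrative only.
from typing import Dict, List, Type

TRANSFORMER_CLASSES: List[Type] = []
NON_DEFAULT_MODELS: Dict[str, Type] = {}


def register_transformer(cls: Type) -> Type:
    # Collect every transformers-backed model class for later matching.
    TRANSFORMER_CLASSES.append(cls)
    return cls


def register_non_default_model(*model_names: str):
    # Map specific model names to a class so it is only chosen for those names,
    # not used as the generic default implementation.
    def wrapper(cls: Type) -> Type:
        for name in model_names:
            NON_DEFAULT_MODELS[name] = cls
        return cls

    return wrapper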