xinference-1.6.0-py3-none-any.whl → xinference-1.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.
Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/client/restful/restful_client.py +1 -1
  3. xinference/conftest.py +0 -7
  4. xinference/core/media_interface.py +9 -8
  5. xinference/core/model.py +13 -6
  6. xinference/core/scheduler.py +1 -10
  7. xinference/core/worker.py +0 -10
  8. xinference/model/audio/model_spec.json +53 -1
  9. xinference/model/audio/model_spec_modelscope.json +57 -1
  10. xinference/model/embedding/core.py +19 -11
  11. xinference/model/image/model_spec.json +10 -1
  12. xinference/model/image/model_spec_modelscope.json +20 -0
  13. xinference/model/llm/__init__.py +6 -54
  14. xinference/model/llm/core.py +19 -5
  15. xinference/model/llm/llama_cpp/core.py +59 -3
  16. xinference/model/llm/llama_cpp/memory.py +455 -0
  17. xinference/model/llm/llm_family.json +185 -397
  18. xinference/model/llm/llm_family.py +88 -16
  19. xinference/model/llm/llm_family_modelscope.json +199 -421
  20. xinference/model/llm/llm_family_openmind_hub.json +0 -34
  21. xinference/model/llm/sglang/core.py +4 -0
  22. xinference/model/llm/transformers/__init__.py +27 -6
  23. xinference/model/llm/transformers/chatglm.py +4 -2
  24. xinference/model/llm/transformers/core.py +49 -28
  25. xinference/model/llm/transformers/deepseek_v2.py +6 -49
  26. xinference/model/llm/transformers/gemma3.py +119 -164
  27. xinference/{thirdparty/omnilmm/train → model/llm/transformers/multimodal}/__init__.py +1 -1
  28. xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
  29. xinference/model/llm/transformers/multimodal/core.py +205 -0
  30. xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
  31. xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
  32. xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
  33. xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
  34. xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
  35. xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
  36. xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
  37. xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
  38. xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
  39. xinference/model/llm/transformers/opt.py +4 -2
  40. xinference/model/llm/transformers/utils.py +6 -37
  41. xinference/model/llm/vllm/core.py +4 -0
  42. xinference/model/rerank/core.py +7 -1
  43. xinference/model/rerank/utils.py +17 -0
  44. xinference/web/ui/build/asset-manifest.json +3 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/js/main.ddf9eaee.js +3 -0
  47. xinference/web/ui/build/static/js/main.ddf9eaee.js.map +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/12e637ed5fa9ca6491b03892b6949c03afd4960fe36ac25744488e7e1982aa19.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/77ac2665a784e99501ae95d32ef5937837a0439a47e965d291b38e99cb619f5b.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/d4ed4e82bfe69915999ec83f5feaa4301c75ecc6bdf1c78f2d03e4671ecbefc8.json +1 -0
  52. xinference/web/ui/src/locales/en.json +3 -1
  53. xinference/web/ui/src/locales/zh.json +3 -1
  54. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/METADATA +16 -14
  55. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/RECORD +60 -76
  56. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/WHEEL +1 -1
  57. xinference/model/llm/transformers/cogvlm2.py +0 -442
  58. xinference/model/llm/transformers/cogvlm2_video.py +0 -333
  59. xinference/model/llm/transformers/deepseek_vl.py +0 -280
  60. xinference/model/llm/transformers/glm_edge_v.py +0 -213
  61. xinference/model/llm/transformers/intern_vl.py +0 -526
  62. xinference/model/llm/transformers/internlm2.py +0 -94
  63. xinference/model/llm/transformers/minicpmv25.py +0 -193
  64. xinference/model/llm/transformers/omnilmm.py +0 -132
  65. xinference/model/llm/transformers/qwen2_audio.py +0 -179
  66. xinference/model/llm/transformers/qwen_vl.py +0 -360
  67. xinference/thirdparty/omnilmm/LICENSE +0 -201
  68. xinference/thirdparty/omnilmm/__init__.py +0 -0
  69. xinference/thirdparty/omnilmm/chat.py +0 -218
  70. xinference/thirdparty/omnilmm/constants.py +0 -4
  71. xinference/thirdparty/omnilmm/conversation.py +0 -332
  72. xinference/thirdparty/omnilmm/model/__init__.py +0 -1
  73. xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
  74. xinference/thirdparty/omnilmm/model/resampler.py +0 -166
  75. xinference/thirdparty/omnilmm/model/utils.py +0 -578
  76. xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
  77. xinference/thirdparty/omnilmm/utils.py +0 -134
  78. xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
  79. xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
  84. /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.ddf9eaee.js.LICENSE.txt} +0 -0
  85. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/entry_points.txt +0 -0
  86. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/licenses/LICENSE +0 -0
  87. {xinference-1.6.0.dist-info → xinference-1.6.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py}

@@ -1,4 +1,4 @@
- # Copyright 2022-2023 XProbe Inc.
+ # Copyright 2022-2025 XProbe Inc.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -13,47 +13,36 @@
  # limitations under the License.
  import logging
  import re
- import uuid
  from concurrent.futures import ThreadPoolExecutor
- from typing import Dict, Iterator, List, Literal, Optional, Union
+ from threading import Thread
+ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union

  import torch

- from ....model.utils import select_device
- from ....types import (
-     ChatCompletion,
-     ChatCompletionChunk,
-     CogagentGenerateConfig,
-     CompletionChunk,
- )
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import (
-     _decode_image,
-     generate_chat_completion,
-     generate_completion_chunk,
-     parse_messages,
- )
- from .core import PytorchChatModel
- from .utils import cache_clean
+ from .....model.utils import select_device
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ...utils import _decode_image, parse_messages
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel

  logger = logging.getLogger(__name__)


- class CogAgentChatModel(PytorchChatModel):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self._torch_type = None
-         self._device = None
-         self._tokenizer = None
-         self._model = None
-         self._platform: Literal["Mac", "WIN", "Mobile"] | None = "Mac"  # type: ignore
-         self._format: Literal[  # type: ignore
-             "(Answer in Action-Operation-Sensitive format.)",
-             "(Answer in Status-Plan-Action-Operation format.)",
-             "(Answer in Status-Action-Operation-Sensitive format.)",
-             "(Answer in Status-Action-Operation format.)",
-             "(Answer in Action-Operation format.)",
-         ] | None = "(Answer in Action-Operation-Sensitive format.)"
+ @register_transformer
+ @register_non_default_model("cogagent")
+ class CogAgentChatModel(PytorchMultiModalModel):
+     def __init__(self, *args, **kws):
+         super().__init__(*args, **kws)
+         self._platform: Optional[Literal["Mac", "WIN", "Mobile"]] = "Mac"
+         self._format: Optional[
+             Literal[
+                 "(Answer in Action-Operation-Sensitive format.)",
+                 "(Answer in Status-Plan-Action-Operation format.)",
+                 "(Answer in Status-Action-Operation-Sensitive format.)",
+                 "(Answer in Status-Action-Operation format.)",
+                 "(Answer in Action-Operation format.)",
+             ]
+         ] = "(Answer in Action-Operation-Sensitive format.)"

      @classmethod
      def match_json(
@@ -64,17 +53,21 @@ class CogAgentChatModel(PytorchChatModel):
              return True
          return False

-     def load(self):
-         from transformers import AutoModelForCausalLM, AutoTokenizer
-
+     def decide_device(self):
          device = self._pytorch_model_config.get("device", "auto")
          self._device = select_device(device)

+     def load_processor(self):
+         from transformers import AutoTokenizer
+
          self._tokenizer = AutoTokenizer.from_pretrained(
              self.model_path, trust_remote_code=True
          )
-         kwargs = self.apply_bnb_quantization()

+     def load_multimodal_model(self):
+         from transformers import AutoModelForCausalLM
+
+         kwargs = self.apply_bnb_quantization()
          self._model = AutoModelForCausalLM.from_pretrained(
              self.model_path,
              torch_dtype=torch.bfloat16,
@@ -153,7 +146,7 @@ class CogAgentChatModel(PytorchChatModel):

          return history_step, history_action

-     def get_query_and_history(
+     def _get_query_and_history(
          self,
          prompt: Union[str, List[Dict]],
          chat_history: Optional[List[Dict]] = None,
@@ -181,26 +174,14 @@ class CogAgentChatModel(PytorchChatModel):
          logger.info(f"query:{query}")
          return query, image

-     @cache_clean
-     def chat(
+     def build_inputs_from_messages(
          self,
          messages: List[Dict],
-         generate_config: Optional[CogagentGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         if generate_config is not None:
-             self._platform = generate_config.pop("platform", self._platform)
-             self._format = generate_config.pop("format", self._format)
-
-         sanitize_generate_config = self._sanitize_generate_config(generate_config)
-         stream = sanitize_generate_config.get("stream")
-         sanitized_config = {
-             "max_length": sanitize_generate_config.get("max_tokens", 512),
-             "top_k": sanitize_generate_config.get("top_k", 1),
-             "do_sample": True,
-         }
+         generate_config: Dict,
+     ):
          prompt, _, chat_history = parse_messages(messages)

-         query, image = self.get_query_and_history(prompt, chat_history)
+         query, image = self._get_query_and_history(prompt, chat_history)

          full_context_kwargs = {
              "return_tensors": "pt",
@@ -218,53 +199,35 @@ class CogAgentChatModel(PytorchChatModel):
              **full_context_kwargs,
          )
          inputs.to(self._model.device)
+         return inputs

-         if stream:
-             it = self._streaming_chat_response(inputs, sanitized_config)
-             return self._to_chat_completion_chunks(it)
-         else:
-             # Generate response
-             with torch.no_grad():
-                 outputs = self._model.generate(**inputs, **sanitized_config)
-                 outputs = outputs[:, inputs["input_ids"].shape[1] :]
-                 response = self._tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-             return generate_chat_completion(self.model_uid, response)
-
-     def _streaming_chat_response(
-         self, inputs: Dict, config: Dict
-     ) -> Iterator[CompletionChunk]:
-         from threading import Thread
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         generate_config = {} if generate_config is None else generate_config
+         self._platform = generate_config.pop("platform", self._platform)
+         self._format = generate_config.pop("format", self._format)
+         return {
+             "max_length": generate_config.get("max_tokens", 512),
+             "top_k": generate_config.get("top_k", 1),
+             "do_sample": True,
+         }

+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
          from transformers import TextIteratorStreamer

+         config = self.build_generate_kwargs(generate_config)
+         inputs = self.build_inputs_from_messages(messages, generate_config)
          streamer = TextIteratorStreamer(
              self._tokenizer, skip_prompt=True, skip_special_tokens=True
          )
-         generation_kwargs = {**inputs, **config}
+         generation_kwargs = {**inputs, **config, "streamer": streamer}

          thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
          thread.start()
-
-         completion_id = str(uuid.uuid1())
-         for new_text in streamer:
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=-1,
-                 completion_tokens=-1,
-                 total_tokens=-1,
-             )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=-1,
-             completion_tokens=-1,
-             total_tokens=-1,
-             has_choice=True,
-             has_content=False,
-         )
+         return streamer, len(inputs.input_ids[0])
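
The refactor above strips CogAgentChatModel down to the new template hooks: build_streaming_iter wires the tokenized inputs into a TextIteratorStreamer, launches generate() on a background thread, and returns the streamer together with the prompt length, while chunk assembly and usage accounting move to the shared base class introduced below. A minimal standalone sketch of that transformers streaming pattern, with a placeholder model name and prompt that are not part of this diff:

# Illustrative only: the generic transformers streaming pattern that the new
# build_streaming_iter relies on. Model name and prompt are placeholders.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "sshleifer/tiny-gpt2"  # placeholder model; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Describe this screenshot:", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() runs on a background thread; the streamer yields decoded text
# pieces to the caller as tokens are produced.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 32, "streamer": streamer},
)
thread.start()
for piece in streamer:
    print(piece, end="", flush=True)
thread.join()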
xinference/model/llm/transformers/multimodal/core.py (new file)

@@ -0,0 +1,205 @@
+ # Copyright 2022-2025 XProbe Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import uuid
+ from abc import abstractmethod
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+ from .....types import (
+     ChatCompletion,
+     ChatCompletionChunk,
+     CompletionChunk,
+     PytorchGenerateConfig,
+ )
+ from ...utils import generate_chat_completion, generate_completion_chunk
+ from ..core import PytorchChatModel
+ from ..utils import cache_clean
+
+
+ class PytorchMultiModalModel(PytorchChatModel):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._tokenizer = None
+         self._device = None
+         self._processor = None
+         self._model = None
+
+     @abstractmethod
+     def decide_device(self):
+         """
+         Update self._device
+         """
+         pass
+
+     @abstractmethod
+     def load_processor(self):
+         """
+         Load self._processor and self._tokenizer
+         """
+         pass
+
+     @abstractmethod
+     def load_multimodal_model(self):
+         """
+         Load self._model
+         """
+         pass
+
+     def load(self):
+         self.decide_device()
+         self.load_processor()
+         self.load_multimodal_model()
+
+     @abstractmethod
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
+         """
+         Convert from input OpenAI-formatted messages to
+         actual parameters needed for inference,
+         e.g. input_ids, attention_masks, etc.
+         """
+         pass
+
+     @abstractmethod
+     def build_generate_kwargs(
+         self,
+         generate_config: Dict,
+     ) -> Dict[str, Any]:
+         """
+         Hyperparameters needed for generation,
+         e.g. temperature, max_new_tokens, etc.
+         """
+         pass
+
+     @abstractmethod
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         """
+         Return the iterator needed for streaming inference and the length of prompt token for statisticians.
+         The length of prompt token usually comes from the input_ids.
+         In this interface you need to call the `build_inputs_from_messages` and `build_generate_kwargs`.
+         """
+         pass
+
+     def get_stop_strs(self) -> List[str]:
+         return []
+
+     def check_conditions(self, new_text: str) -> Tuple[str, bool]:
+         stop_strs = self.get_stop_strs()
+         for ss in stop_strs:
+             if new_text.endswith(ss):
+                 new_text = new_text[: -len(ss)]
+                 break
+         return new_text, False
+
+     def generate_non_streaming(
+         self,
+         messages: List[Dict],
+         generate_config: Optional[PytorchGenerateConfig] = None,
+     ) -> ChatCompletion:
+         generate_config = generate_config if generate_config else {}  # type: ignore
+         streamer, prompt_tokens = self.build_streaming_iter(messages, generate_config)  # type: ignore
+         completion_tokens, total_tokens = 0, 0
+         res = ""
+         for i, new_text in enumerate(streamer):
+             new_text, should_stop = self.check_conditions(new_text)
+             if should_stop:
+                 break
+             completion_tokens = i
+             total_tokens = prompt_tokens + completion_tokens
+             res += new_text
+         return generate_chat_completion(
+             self.model_uid,
+             res,
+             prompt_tokens=prompt_tokens,
+             completion_tokens=completion_tokens if prompt_tokens != -1 else -1,
+             total_tokens=total_tokens if prompt_tokens != -1 else -1,
+         )
+
+     def generate_streaming(
+         self,
+         messages: List[Dict],
+         generate_config: Optional[PytorchGenerateConfig] = None,
+     ) -> Iterator[CompletionChunk]:
+         generate_config = generate_config if generate_config else {}  # type: ignore
+         streamer, prompt_tokens = self.build_streaming_iter(messages, generate_config)  # type: ignore
+         stream_options = generate_config.pop("stream_options", None)
+         include_usage = (
+             stream_options["include_usage"]
+             if isinstance(stream_options, dict)
+             else False
+         )
+
+         completion_id = str(uuid.uuid1())
+         completion_tokens, total_tokens = 0, 0
+         for i, new_text in enumerate(streamer):
+             new_text, should_stop = self.check_conditions(new_text)
+             if should_stop:
+                 break
+             completion_tokens = i
+             total_tokens = prompt_tokens + completion_tokens
+             yield generate_completion_chunk(
+                 chunk_text=new_text,
+                 finish_reason=None,
+                 chunk_id=completion_id,
+                 model_uid=self.model_uid,
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens if prompt_tokens != -1 else -1,
+                 total_tokens=total_tokens if prompt_tokens != -1 else -1,
+                 has_choice=True,
+                 has_content=True,
+             )
+         yield generate_completion_chunk(
+             chunk_text=None,
+             finish_reason="stop",
+             chunk_id=completion_id,
+             model_uid=self.model_uid,
+             prompt_tokens=prompt_tokens,
+             completion_tokens=completion_tokens if prompt_tokens != -1 else -1,
+             total_tokens=total_tokens if prompt_tokens != -1 else -1,
+             has_choice=True,
+             has_content=False,
+         )
+         if include_usage:
+             yield generate_completion_chunk(
+                 chunk_text=None,
+                 finish_reason=None,
+                 chunk_id=completion_id,
+                 model_uid=self.model_uid,
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens if prompt_tokens != -1 else -1,
+                 total_tokens=total_tokens if prompt_tokens != -1 else -1,
+                 has_choice=False,
+                 has_content=False,
+             )
+
+     @cache_clean
+     def chat(
+         self,
+         messages: List[Dict],
+         generate_config: Optional[PytorchGenerateConfig] = None,
+     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+         stream = generate_config.get("stream", False) if generate_config else False
+         return (
+             self._to_chat_completion_chunks(
+                 self.generate_streaming(messages, generate_config)
+             )
+             if stream
+             else self.generate_non_streaming(messages, generate_config)
+         )
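
The new PytorchMultiModalModel base class turns load() and chat() into a template method: subclasses implement only device selection, processor/model loading, and the three build_* hooks, while streaming, stop-string trimming, and usage chunks are handled once here. A hedged sketch of what a concrete subclass might look like; the class name, processor calls, and defaults below are assumptions for illustration, not code shipped in this release:

# Illustrative only: a hypothetical subclass showing how the PytorchMultiModalModel
# hooks are meant to be filled in.
from threading import Thread
from typing import Any, Dict, Iterator, List, Tuple

from xinference.model.llm.transformers.multimodal.core import PytorchMultiModalModel


class MyVLChatModel(PytorchMultiModalModel):  # hypothetical subclass
    def decide_device(self):
        # the real models go through select_device(); "cuda" is a placeholder
        self._device = "cuda"

    def load_processor(self):
        from transformers import AutoProcessor, AutoTokenizer

        self._processor = AutoProcessor.from_pretrained(self.model_path)
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)

    def load_multimodal_model(self):
        from transformers import AutoModelForCausalLM

        self._model = AutoModelForCausalLM.from_pretrained(self.model_path).to(self._device)

    def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict):
        # turn OpenAI-style messages into model tensors (input_ids, attention_mask, ...)
        text = self._processor.apply_chat_template(messages, add_generation_prompt=True)
        return self._processor(text=text, return_tensors="pt").to(self._device)

    def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
        return {"max_new_tokens": generate_config.get("max_tokens", 512)}

    def build_streaming_iter(
        self, messages: List[Dict], generate_config: Dict
    ) -> Tuple[Iterator, int]:
        from transformers import TextIteratorStreamer

        inputs = self.build_inputs_from_messages(messages, generate_config)
        kwargs = self.build_generate_kwargs(generate_config)
        streamer = TextIteratorStreamer(
            self._tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        Thread(
            target=self._model.generate,
            kwargs={**inputs, **kwargs, "streamer": streamer},
        ).start()
        return streamer, len(inputs["input_ids"][0])

With these hooks in place, the inherited chat() dispatches to generate_streaming or generate_non_streaming depending on the stream flag in generate_config, applying check_conditions and get_stop_strs to every streamed piece.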
xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py}

@@ -1,4 +1,4 @@
- # Copyright 2022-2023 XProbe Inc.
+ # Copyright 2022-2025 XProbe Inc.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -13,32 +13,28 @@
  # limitations under the License.
  import base64
  import logging
- import os.path
+ import os
  import tempfile
- import uuid
  from concurrent.futures import ThreadPoolExecutor
  from io import BytesIO
- from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+ from typing import Any, Dict, Iterator, List, Tuple

  import requests
  import torch

- from ....model.utils import select_device
- from ....types import ChatCompletion, ChatCompletionChunk, CompletionChunk
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from ..utils import generate_chat_completion, generate_completion_chunk
- from .core import PytorchChatModel, PytorchGenerateConfig
- from .utils import cache_clean
+ from .....model.utils import select_device
+ from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer
+ from ..core import register_non_default_model
+ from .core import PytorchMultiModalModel

  logger = logging.getLogger(__name__)


- class DeepSeekVL2ChatModel(PytorchChatModel):
+ @register_transformer
+ @register_non_default_model("deepseek-vl2")
+ class DeepSeekVL2ChatModel(PytorchMultiModalModel):
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
-         self._tokenizer = None
-         self._model = None
-         self._vl_chat_processor = None
          self._type = None

      @classmethod
@@ -50,25 +46,26 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
              return True
          return False

-     def load(self):
-         from transformers import AutoModelForCausalLM
-
-         from ....thirdparty.deepseek_vl2.models import (
-             DeepseekVLV2ForCausalLM,
-             DeepseekVLV2Processor,
-         )
-
+     def decide_device(self):
          self._device = self._pytorch_model_config.get("device", "auto")
          self._device = select_device(self._device)
          self._type = torch.bfloat16
-         kwargs = self.apply_bnb_quantization()
+
+     def load_processor(self):
+         from .....thirdparty.deepseek_vl2.models import DeepseekVLV2Processor

          # specify the path to the model
-         self._vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(  # type: ignore
+         self._processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(  # type: ignore
              self.model_path
          )
-         self._tokenizer = self._vl_chat_processor.tokenizer
+         self._tokenizer = self._processor.tokenizer
+
+     def load_multimodal_model(self):
+         from transformers import AutoModelForCausalLM
+
+         from .....thirdparty.deepseek_vl2.models import DeepseekVLV2ForCausalLM

+         kwargs = self.apply_bnb_quantization()
          vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
              self.model_path,
              trust_remote_code=True,
@@ -138,29 +135,24 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
                  elif c_type == "text":
                      new_content.append(c["text"])
              if images:
-                 new_content.insert(0, "<image_placeholder>")
                  images = _download(images)
              return "".join(new_content), images
          return content, []

-     @cache_clean
-     def chat(
-         self,
-         messages: List[Dict],
-         generate_config: Optional[PytorchGenerateConfig] = None,
-     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-         if not generate_config:
-             generate_config = {}
+     def get_stop_strs(self) -> List[str]:
+         conversation = self._processor.new_chat_template()
+         stop_str = conversation.sep2
+         return [stop_str]

-         stream = generate_config.get("stream", False)
-         stream_options = generate_config.pop("stream_options", None)
-         include_usage = (
-             stream_options["include_usage"]
-             if isinstance(stream_options, dict)
-             else False
-         )
+     def build_generate_kwargs(self, generate_config: Dict):
+         max_new_tokens = generate_config.get("max_tokens", 512)
+         return {"max_new_tokens": max_new_tokens}

-         prompt = ""
+     def build_inputs_from_messages(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ):
          deepseek_messages = []
          for i, message in enumerate(messages):
              role = message["role"]
@@ -183,8 +175,6 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
                  msg["images"] = images
                  deepseek_messages.append(msg)
                  deepseek_messages.append({"role": "<|Assistant|>", "content": ""})
-                 if i == len(messages) - 1:
-                     prompt = "<image>\n<|ref|>" + content + "<|/ref|>"
              elif role == "assistant":
                  deepseek_messages.append({"role": "<|Assistant|>", "content": content})
              else:
@@ -192,11 +182,11 @@ class DeepSeekVL2ChatModel(PytorchChatModel):
                      f"Unexpected message in messages: role: {role}, message: {message}"
                  )

-         from ....thirdparty.deepseek_vl2.utils.io import load_pil_images
+         from .....thirdparty.deepseek_vl2.utils.io import load_pil_images

          # load images and prepare for inputs
          pil_images = load_pil_images(deepseek_messages)
-         prepare_inputs = self._vl_chat_processor(
+         prepare_inputs = self._processor(
              conversations=deepseek_messages,
              images=pil_images,
              force_batchify=True,
@@ -205,88 +195,37 @@ class DeepSeekVL2ChatModel(PytorchChatModel):

          # run image encoder to get the image embeddings
          inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-         max_new_tokens = generate_config.get("max_tokens", 512)
-         conversation = self._vl_chat_processor.new_chat_template()
-         stop_str = conversation.sep2
-
-         streamer = self._model.language.generate(
+         return dict(
+             input_ids=prepare_inputs.input_ids,
              inputs_embeds=inputs_embeds,
              attention_mask=prepare_inputs.attention_mask,
              pad_token_id=self._tokenizer.eos_token_id,
              bos_token_id=self._tokenizer.bos_token_id,
              eos_token_id=self._tokenizer.eos_token_id,
-             max_new_tokens=max_new_tokens,
+         )
+
+     def build_streaming_iter(
+         self,
+         messages: List[Dict],
+         generate_config: Dict,
+     ) -> Tuple[Iterator, int]:
+         _inputs = self.build_inputs_from_messages(messages, generate_config)
+         configs = self.build_generate_kwargs(generate_config)
+         streamer = self._model.language.generate(
+             **_inputs,
+             **configs,
              do_sample=False,
              use_cache=True,
          )
+         return streamer, len(_inputs["input_ids"][0])

-         if stream:
-             it = self._generate_stream(streamer, stop_str, include_usage, prompt)
-             return self._to_chat_completion_chunks(it)
-         else:
-             return self._generate(streamer, stop_str)
-
-     def _generate(self, streamer, stop_str) -> ChatCompletion:
-         generated_text = ""
-
-         for new_text in streamer:
-             if isinstance(new_text, torch.Tensor):
-                 new_text = self._tokenizer.decode(
-                     new_text.cpu().tolist(), skip_special_tokens=True
-                 )
-
-             if new_text.endswith(stop_str):
-                 new_text = new_text[: -len(stop_str)]
-
-             generated_text += new_text
-
-         return generate_chat_completion(self.model_uid, generated_text)
-
-     def _generate_stream(
-         self, streamer, stop_str, include_usage, prompt
-     ) -> Iterator[CompletionChunk]:
-         completion_id = str(uuid.uuid1())
-         prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
-         input_ids = self._tokenizer(prompt).input_ids
-         prompt_tokens = len(input_ids)
-         for i, new_text in enumerate(streamer):
-             if new_text.endswith(stop_str):
-                 new_text = new_text[: -len(stop_str)]
-             completion_tokens = i
-             total_tokens = prompt_tokens + completion_tokens
-             yield generate_completion_chunk(
-                 chunk_text=new_text,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=prompt_tokens,
-                 completion_tokens=completion_tokens,
-                 total_tokens=total_tokens,
-                 has_choice=True,
-                 has_content=True,
+     def check_conditions(self, new_text: str) -> Tuple[str, bool]:
+         stop_str = self.get_stop_strs()[0]
+         if isinstance(new_text, torch.Tensor):
+             new_text = self._tokenizer.decode(
+                 new_text.cpu().tolist(), skip_special_tokens=True
              )
-         yield generate_completion_chunk(
-             chunk_text=None,
-             finish_reason="stop",
-             chunk_id=completion_id,
-             model_uid=self.model_uid,
-             prompt_tokens=prompt_tokens,
-             completion_tokens=completion_tokens,
-             total_tokens=total_tokens,
-             has_choice=True,
-             has_content=False,
-         )

-         if include_usage:
-             yield generate_completion_chunk(
-                 chunk_text=None,
-                 finish_reason=None,
-                 chunk_id=completion_id,
-                 model_uid=self.model_uid,
-                 prompt_tokens=prompt_tokens,
-                 completion_tokens=completion_tokens,
-                 total_tokens=total_tokens,
-                 has_choice=False,
-                 has_content=False,
-             )
+         if new_text.endswith(stop_str):
+             new_text = new_text[: -len(stop_str)]
+         return new_text, False
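
With DeepSeekVL2ChatModel, the per-model _generate and _generate_stream loops disappear: get_stop_strs exposes the conversation separator and check_conditions trims it from each streamed piece before the shared base class assembles the completion chunks. A tiny standalone illustration of that trimming behaviour; the stop string here is made up for the example:

# Standalone illustration of the stop-string trimming that check_conditions
# performs on each streamed piece; the stop string is a made-up placeholder.
def trim_stop(new_text: str, stop_strs: list) -> str:
    for ss in stop_strs:
        if new_text.endswith(ss):
            return new_text[: -len(ss)]
    return new_text


assert trim_stop("Hello.<|end_of_sentence|>", ["<|end_of_sentence|>"]) == "Hello."
assert trim_stop("Hello.", ["<|end_of_sentence|>"]) == "Hello."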