xinference 0.12.3__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (71)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +6 -6
  3. xinference/client/restful/restful_client.py +0 -2
  4. xinference/core/model.py +21 -4
  5. xinference/core/scheduler.py +2 -0
  6. xinference/core/worker.py +74 -45
  7. xinference/deploy/utils.py +33 -2
  8. xinference/model/llm/__init__.py +5 -0
  9. xinference/model/llm/llm_family.json +240 -1
  10. xinference/model/llm/llm_family.py +32 -8
  11. xinference/model/llm/llm_family_modelscope.json +192 -0
  12. xinference/model/llm/mlx/__init__.py +13 -0
  13. xinference/model/llm/mlx/core.py +408 -0
  14. xinference/model/llm/pytorch/chatglm.py +2 -9
  15. xinference/model/llm/pytorch/cogvlm2.py +206 -21
  16. xinference/model/llm/pytorch/core.py +213 -40
  17. xinference/model/llm/pytorch/glm4v.py +171 -15
  18. xinference/model/llm/pytorch/qwen_vl.py +168 -7
  19. xinference/model/llm/pytorch/utils.py +53 -62
  20. xinference/model/llm/utils.py +24 -5
  21. xinference/model/rerank/core.py +5 -0
  22. xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
  23. xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
  24. xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
  25. xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
  26. xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
  27. xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
  28. xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
  29. xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
  30. xinference/web/ui/build/asset-manifest.json +3 -3
  31. xinference/web/ui/build/index.html +1 -1
  32. xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
  33. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
  49. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
  50. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
  51. xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
  52. xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
  67. /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
  68. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
  69. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
  70. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
  71. {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/core.py

@@ -18,6 +18,8 @@ import os
 from functools import lru_cache
 from typing import Iterable, Iterator, List, Optional, Tuple, Union
 
+import torch
+
 from ....core.scheduler import InferenceRequest
 from ....device_utils import (
     get_device_preferred_dtype,
@@ -43,7 +45,7 @@ from ...utils import select_device
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import QWEN_TOOL_CALL_FAMILY, ChatModelMixin
-from .utils import get_context_length, get_max_src_len
+from .utils import get_context_length, get_max_src_len, pad_prefill_tokens
 
 logger = logging.getLogger(__name__)
 
@@ -409,9 +411,171 @@ class PytorchModel(LLM):
         else:
             return generator_wrapper(prompt, generate_config)
 
+    def build_prefill_attention_mask(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build attention mask for the prefill phase.
+        Padding `0` on the left.
+        Note that the parameter `seq_length` is from `input_ids`.
+        """
+        data = []
+        for r in reqs:
+            real_len = seq_length - r.padding_len
+            x = torch.cat(
+                [
+                    torch.full((r.padding_len,), 0, dtype=torch.long),
+                    torch.ones((real_len,), dtype=torch.long),
+                ]
+            )
+            data.append(x)
+            r.extra_kwargs["attention_mask_seq_len"] = real_len
+        return torch.stack(data).to(self._device)
+
+    def build_decode_attention_mask(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build attention mask for the decode phase.
+        Note that the `seq_length` parameter is from the merged kv_cache,
+        so we need to pad `0` on the left again.
+        """
+        data = []
+        for r in reqs:
+            r.extra_kwargs["attention_mask_seq_len"] += 1
+            attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = seq_length - attention_mask_seq_len
+            x = torch.cat(
+                [
+                    torch.full((pad_len,), 0, dtype=torch.long),
+                    torch.ones((attention_mask_seq_len,), dtype=torch.long),
+                ]
+            )
+            data.append(x)
+        return torch.stack(data).to(self._device)
+
+    def build_prefill_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build position ids for the prefill phase.
+        Padding `0` on the left.
+        Note that the parameter `seq_length` is from `input_ids`.
+        Record the `max_position_id` on the request for the decode phase.
+        """
+        res = []
+        for r in reqs:
+            real_seq_len = seq_length - r.padding_len
+            res.append(
+                torch.cat(
+                    [
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                        torch.arange(0, real_seq_len, dtype=torch.long),
+                    ]
+                )
+            )
+            r.extra_kwargs["max_position_id"] = real_seq_len - 1
+        return torch.stack(res).to(self._device)
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build position ids for the decode phase.
+        For most models, just increment the `max_position_id` from the previous
+        step and use the latest value.
+        """
+        data = []
+        for r in reqs:
+            r.extra_kwargs["max_position_id"] += 1
+            data.append([r.extra_kwargs["max_position_id"]])
+        position_ids = torch.as_tensor(data, dtype=torch.long, device=self._device)
+        return position_ids
+
+    def build_prefill_token_type_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build token_type_ids for the prefill phase.
+        For most models, this is not required.
+        """
+        return None
+
+    def build_decode_token_type_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Build token_type_ids for the decode phase.
+        For most models, this is not required.
+        """
+        return None
+
+    def build_prefill_inputs(self, prompts: List, req_list: List[InferenceRequest]):
+        """
+        Get inputs for inference. Models may have their own impl.
+        """
+        assert isinstance(prompts[0], str)
+        inputs = self._tokenizer(prompts, padding=False).input_ids
+        context_len = self.get_context_len()
+        input_ids = torch.as_tensor(
+            pad_prefill_tokens(inputs, context_len, req_list), device=self._device
+        )
+        return input_ids
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        """
+        Get all input parameters for the prefill phase. Models may have their own impl.
+        """
+        input_ids = self.build_prefill_inputs(prompts, req_list)
+        res = {"input_ids": input_ids}
+        batch_size, seq_len = input_ids.shape
+        attention_mask = self.build_prefill_attention_mask(
+            batch_size, seq_len, req_list
+        )
+        if attention_mask is not None:
+            res["attention_mask"] = attention_mask
+        position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list)
+        if position_ids is not None:
+            res["position_ids"] = position_ids
+        token_type_ids = self.build_prefill_token_type_ids(
+            batch_size, seq_len, req_list
+        )
+        if token_type_ids is not None:
+            res["token_type_ids"] = token_type_ids
+        return res
+
+    def build_decode_kwargs(
+        self,
+        prompts: List,
+        req_list: List[InferenceRequest],
+        batch_size: int,
+        seq_len: int,
+    ):
+        """
+        Get all input parameters for the decode phase. Models may have their own impl.
+        """
+        res = {"input_ids": torch.as_tensor(prompts, device=self._device)}
+        attention_mask = self.build_decode_attention_mask(batch_size, seq_len, req_list)
+        if attention_mask is not None:
+            res["attention_mask"] = attention_mask
+        position_ids = self.build_decode_position_ids(batch_size, seq_len, req_list)
+        if position_ids is not None:
+            res["position_ids"] = position_ids
+        token_type_ids = self.build_decode_token_type_ids(batch_size, seq_len, req_list)
+        if token_type_ids is not None:
+            res["token_type_ids"] = token_type_ids
+        return res
+
     @staticmethod
-    def require_attention_mask():
-        return False
+    def get_batch_size_and_seq_len_indexes_from_kv() -> Tuple[int, int]:
+        """
+        Per the huggingface transformers documentation, `past_key_values` has the
+        shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+        However, for some models, the shape may differ.
+        """
+        return 0, 2
+
+    def get_dtype(self):
+        raise NotImplementedError("Not implemented.")
 
     @lru_cache
     def get_context_len(self):
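
The new build_prefill_* and build_decode_* hooks above implement left padding so that requests of different prompt lengths can share one batch: each request records its padding_len, the attention mask carries zeros over the padded positions, and position ids start counting from the first real token. A minimal standalone sketch of that padding scheme, with illustrative values that are not part of the xinference API:

import torch

seq_length = 5           # padded length of input_ids in the batch
padding_lens = [2, 0]    # per-request left padding, like r.padding_len

attention_masks, position_ids = [], []
for pad in padding_lens:
    real_len = seq_length - pad
    # 0 over the left padding, 1 over the real tokens
    attention_masks.append(
        torch.cat([torch.zeros(pad, dtype=torch.long), torch.ones(real_len, dtype=torch.long)])
    )
    # positions 0..real_len-1, with 0 repeated under the padding
    position_ids.append(
        torch.cat([torch.zeros(pad, dtype=torch.long), torch.arange(real_len, dtype=torch.long)])
    )

print(torch.stack(attention_masks))  # tensor([[0, 0, 1, 1, 1], [1, 1, 1, 1, 1]])
print(torch.stack(position_ids))     # tensor([[0, 0, 0, 1, 2], [0, 1, 2, 3, 4]])

During decode each request then just increments its recorded max_position_id (so the first generated tokens in this example would use position ids [[3], [5]]), and get_batch_size_and_seq_len_indexes_from_kv returning (0, 2) points at the batch and sequence dimensions of past_key_values.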
@@ -426,28 +590,38 @@ class PytorchModel(LLM):
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
         # check some parameters
         for r in req_list:
-            if r.sanitized_generate_config is None:
-                r.sanitized_generate_config = self.prepare_sanitize_generate_config(r)
-            if r.is_prefill:
-                # check some generate params
-                max_src_len = get_max_src_len(self.get_context_len(), r)  # type: ignore
-                if max_src_len < 0:
-                    r.stopped = True
-                    r.error_msg = "Max tokens exceeds model's max length"
-                    continue
-                if r.stream_interval <= 0:
-                    r.stopped = True
-                    r.error_msg = "`stream_interval` must be greater than 0"
-                    continue
-                stop_str = r.sanitized_generate_config.get("stop", None)
-                if stop_str and (
-                    not (isinstance(stop_str, str) or isinstance(stop_str, Iterable))
-                ):
-                    r.stopped = True
-                    r.error_msg = "Invalid `stop` field type"
-                    continue
-
-    def _get_builtin_stop_token_ids(self) -> Tuple:
+            try:
+                if r.sanitized_generate_config is None:
+                    r.sanitized_generate_config = self.prepare_sanitize_generate_config(
+                        r
+                    )
+                if r.is_prefill:
+                    # check some generate params
+                    max_src_len = get_max_src_len(self.get_context_len(), r)  # type: ignore
+                    if max_src_len < 0:
+                        r.stopped = True
+                        r.error_msg = "Max tokens exceeds model's max length"
+                        continue
+                    if r.stream_interval <= 0:
+                        r.stopped = True
+                        r.error_msg = "`stream_interval` must be greater than 0"
+                        continue
+                    stop_str = r.sanitized_generate_config.get("stop", None)
+                    if stop_str and (
+                        not (
+                            isinstance(stop_str, str) or isinstance(stop_str, Iterable)
+                        )
+                    ):
+                        r.stopped = True
+                        r.error_msg = "Invalid `stop` field type"
+                        continue
+            # Catch the exception here; otherwise the request would hang.
+            except Exception as e:
+                logger.exception(f"prepare inference error with {e}")
+                r.stopped = True
+                r.error_msg = str(e)
+
+    def get_builtin_stop_token_ids(self) -> Tuple:
         return (
             tuple(self.model_family.prompt_style.stop_token_ids)
             if self.model_family.prompt_style
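
The reworked prepare_batch_inference wraps the per-request checks in try/except so that a failing request is marked stopped with an error message instead of raising and leaving the whole batch hanging. A rough standalone sketch of that pattern; FakeRequest is a stand-in for illustration, not xinference's InferenceRequest:

import logging

logger = logging.getLogger(__name__)

class FakeRequest:
    def __init__(self, prompt):
        self.prompt = prompt
        self.stopped = False
        self.error_msg = None

def prepare(req_list):
    for r in req_list:
        try:
            # any per-request validation that may raise
            if not r.prompt:
                raise ValueError("empty prompt")
        except Exception as e:
            # fail only this request; the rest of the batch keeps going
            logger.exception(f"prepare inference error with {e}")
            r.stopped = True
            r.error_msg = str(e)

reqs = [FakeRequest("hello"), FakeRequest("")]
prepare(reqs)
print([(r.stopped, r.error_msg) for r in reqs])  # [(False, None), (True, 'empty prompt')]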
@@ -494,17 +668,8 @@ class PytorchModel(LLM):
         from .utils import batch_inference_one_step
 
         self.prepare_batch_inference(req_list)
-        context_len = self.get_context_len()
-        assert isinstance(context_len, int)
         batch_inference_one_step(
-            req_list,
-            self.model_uid,
-            self._model,
-            self._tokenizer,
-            self._device,
-            context_len,
-            self._get_builtin_stop_token_ids(),
-            require_attention_mask=self.require_attention_mask(),
+            self, req_list, self.model_uid, self._model, self._tokenizer
         )
         self.handle_batch_inference_results(req_list)
 
@@ -696,14 +861,20 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
         super().prepare_batch_inference(req_list)
         for r in req_list:
-            r.full_prompt = self._get_full_prompt(
-                r.prompt, r.system_prompt, r.chat_history, None
-            )
+            try:
+                if not r.stopped and r.is_prefill:
+                    r.full_prompt = self._get_full_prompt(
+                        r.prompt, r.system_prompt, r.chat_history, None
+                    )
+            except Exception as e:
+                logger.exception(f"prepare inference error with {e}")
+                r.stopped = True
+                r.error_msg = str(e)
 
     def handle_batch_inference_results(self, req_list: List[InferenceRequest]):
         for req in req_list:
-            if req.stream and req.error_msg is None:
-                if req.completion:
+            if req.error_msg is None and req.completion:
+                if req.stream:
                     results = []
                     for i, c in enumerate(req.completion):
                         if c == "<bos_stream>":
@@ -722,3 +893,5 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
                             self._get_final_chat_completion_chunk(req.completion[-1])
                         )
                     req.completion = results
+                else:
+                    req.completion[0] = self._to_chat_completion(req.completion[0])
xinference/model/llm/pytorch/glm4v.py

@@ -14,6 +14,7 @@
 import base64
 import logging
 import time
+import typing
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
@@ -24,6 +25,7 @@ import requests
 import torch
 from PIL import Image
 
+from ....core.scheduler import InferenceRequest
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
@@ -36,6 +38,7 @@ from ....types import (
 from ...utils import select_device
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import get_max_src_len
 
 logger = logging.getLogger(__name__)
 
@@ -69,7 +72,6 @@ class Glm4VModel(PytorchChatModel):
         if quantization != "none":
             if self._device == "cuda" and self._is_linux():
                 kwargs["device_map"] = "auto"
-                self._device = "auto"
             if quantization == "4-bit":
                 kwargs["load_in_4bit"] = True
             elif quantization == "8-bit":
@@ -137,9 +139,6 @@ class Glm4VModel(PytorchChatModel):
                 fut = executor.submit(_load_image, image_url)
                 image_futures.append(fut)
         images = [fut.result() for fut in image_futures]
-        # images = []
-        # for image_url in image_urls:
-        #     images.append(_load_image(image_url))
         text = " ".join(texts)
         if len(images) == 0:
             return text, []
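
In the hunk above, image URLs referenced in a message are fetched concurrently through a ThreadPoolExecutor, and the leftover commented-out sequential loop is dropped. A hedged sketch of that concurrent pattern in isolation; load_image and the URLs are illustrative stand-ins, not xinference helpers:

from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import requests
from PIL import Image

def load_image(url: str) -> Image.Image:
    # download one image and decode it with PIL
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return Image.open(BytesIO(resp.content)).convert("RGB")

image_urls = ["https://example.com/a.png", "https://example.com/b.png"]
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(load_image, url) for url in image_urls]
    images = [fut.result() for fut in futures]  # results come back in input order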
@@ -149,19 +148,11 @@ class Glm4VModel(PytorchChatModel):
                 raise RuntimeError("Only one image per message is supported")
         return content, []
 
-    def chat(
+    def _get_chat_msgs(
         self,
         prompt: Union[str, List[Dict]],
-        system_prompt: Optional[str] = None,
         chat_history: Optional[List[ChatCompletionMessage]] = None,
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        from transformers import TextIteratorStreamer
-
-        if not generate_config:
-            generate_config = {}
-
-        stream = generate_config.get("stream", False)
+    ):
         content, images_chat = self._message_content_to_chat(prompt)
 
         msgs = []
@@ -170,7 +161,7 @@ class Glm4VModel(PytorchChatModel):
         for h in chat_history or []:
             role = h["role"]
             content_h, images_tmp = self._message_content_to_chat(h["content"])
-            if images_tmp != []:
+            if images_tmp:
                 images_history = images_tmp
             if len(query_to_response) == 0 and role == "user":
                 query_to_response.append({"role": "user", "content": content_h})
@@ -185,6 +176,22 @@ class Glm4VModel(PytorchChatModel):
             elif len(images_history) > 0:
                 image = images_history[0]
                 msgs.append({"role": "user", "content": content, "image": image})
+        return msgs
+
+    def chat(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        from transformers import TextIteratorStreamer
+
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        msgs = self._get_chat_msgs(prompt, chat_history)
 
         inputs = self._tokenizer.apply_chat_template(
             msgs,
@@ -282,3 +289,152 @@ class Glm4VModel(PytorchChatModel):
             )
             chunk["usage"] = completion_usage
             yield chunk
+
+    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
+        msgs = self._get_chat_msgs(prompt, chat_history)
+        inputs = self._tokenizer.apply_chat_template(
+            msgs,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors="pt",
+            return_dict=True,
+        )
+        return {
+            "input_ids": inputs.input_ids.squeeze(0),
+            "images": inputs.images.squeeze(0),
+        }
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to https://huggingface.co/THUDM/glm-4v-9b/blob/main/generation_config.json
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.8
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        return raw_config
+
+    def build_prefill_inputs(self, prompts: List, req_list: List[InferenceRequest]):
+        context_len = self.get_context_len()
+        assert isinstance(prompts[0], dict)
+        images = []
+        max_length = float("-inf")
+        for i, feature in enumerate(prompts):
+            req = req_list[i]
+            if "images" in feature:
+                images.append(feature.pop("images", None))
+            max_src_len = get_max_src_len(context_len, req)
+            input_ids = feature["input_ids"][-max_src_len:]
+            req.prompt_tokens = input_ids.tolist()
+            feature["input_ids"] = input_ids
+            max_length = max(len(input_ids), max_length)
+
+        def pad_to_max_length_internal(feature, max_len, idx):
+            padding_length = max_len - len(feature["input_ids"])
+            req_list[idx].padding_len = padding_length
+            feature["input_ids"] = torch.cat(
+                [torch.full((padding_length,), 0), feature["input_ids"]]
+            )
+            return feature
+
+        features = [
+            pad_to_max_length_internal(feature, max_length, i)
+            for i, feature in enumerate(prompts)
+        ]
+        batch = {
+            key: torch.stack([feature[key] for feature in features])
+            for key in features[0].keys()
+        }
+        if images:
+            batch["images"] = torch.stack(images).to(self._device)
+        batch["input_ids"] = batch["input_ids"].to(self._device)
+        return batch
+
+    @staticmethod
+    def is_empty(images_list: Optional[List[List[torch.Tensor]]]):
+        """
+        Copied from https://huggingface.co/THUDM/glm-4v-9b/blob/main/modeling_chatglm.py
+        """
+        if images_list is None or len(images_list) == 0:
+            return True
+        for image_list in images_list:
+            if image_list is not None:
+                return False
+        return True
+
+    @typing.no_type_check
+    def get_full_attention_mask(
+        self, attention_mask, input_ids, images, req_list: List[InferenceRequest]
+    ):
+        """
+        Modified according to https://huggingface.co/THUDM/glm-4v-9b/blob/main/modeling_chatglm.py
+        """
+        image_size: int = self._model.config.vision_config["image_size"]
+        patch_size: int = self._model.config.vision_config["patch_size"]
+        num_patches = (image_size // patch_size // 2) ** 2
+        new_attention_masks = []
+
+        # if there is no image, use these default ids
+        eoi_token_pos = 6
+        boi_token_pos = 4
+
+        for i in range(len(input_ids)):
+            input_id = input_ids[i].tolist()
+            req = req_list[i]
+            if not self.is_empty(images):
+                _boi_token_pos, _eoi_token_pos = input_id.index(
+                    self._model.config.boi_token_id
+                ), input_id.index(self._model.config.eoi_token_id)
+            else:
+                _boi_token_pos = boi_token_pos + req.padding_len
+                _eoi_token_pos = eoi_token_pos + req.padding_len
+            assert eoi_token_pos - boi_token_pos == 2
+            new_attention_masks.append(
+                torch.cat(
+                    (
+                        attention_mask[i, : _boi_token_pos + 1],
+                        attention_mask.new_ones(num_patches),
+                        attention_mask[i, _eoi_token_pos:],
+                    )
+                )
+            )
+        attention_mask = torch.stack(new_attention_masks, dim=0).to(self._device)
+        return attention_mask
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        batch = self.build_prefill_inputs(prompts, req_list)
+        batch_size, seq_len = batch["input_ids"].shape
+        attention_mask = self.build_prefill_attention_mask(
+            batch_size, seq_len, req_list
+        )
+        if attention_mask is not None:
+            full_attention_mask = self.get_full_attention_mask(
+                attention_mask, batch["input_ids"], batch["images"], req_list
+            )
+            batch["attention_mask"] = full_attention_mask
+            for r in req_list:
+                r.extra_kwargs["attention_mask_seq_len"] = full_attention_mask.shape[1]
+        position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list)
+        if position_ids is not None:
+            batch["position_ids"] = position_ids
+        return batch
+
+    def build_decode_attention_mask(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        max_seq_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs)
+
+        new_attention_mask = []
+        for r in reqs:
+            attn_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = max_seq_len - attn_mask_seq_len
+            new_attention_mask.append(
+                torch.cat(
+                    [torch.full((pad_len,), 0), torch.ones((attn_mask_seq_len + 1,))]
+                )
+            )
+            r.extra_kwargs["attention_mask_seq_len"] += 1
+        return torch.stack(new_attention_mask, dim=0).to(self._device)
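
get_full_attention_mask above splices extra mask entries between the BOI and EOI tokens so the mask also covers the image patch embeddings, with the patch count given by (image_size // patch_size // 2) ** 2 (for example, 1120 // 14 // 2 squared would be 1600 patches if the vision config carried those values). A toy, standalone illustration of that splice; every size and position below is made up for the example:

import torch

attention_mask = torch.tensor([0, 0, 1, 1, 1, 1, 1, 1])  # left-padded text mask for one request
boi_token_pos, eoi_token_pos = 4, 6                       # BOI/EOI positions inside the mask
num_patches = 4                                           # stands in for (image_size // patch_size // 2) ** 2

full_mask = torch.cat(
    (
        attention_mask[: boi_token_pos + 1],   # everything up to and including BOI
        attention_mask.new_ones(num_patches),  # one mask entry per image patch
        attention_mask[eoi_token_pos:],        # EOI and the remaining text
    )
)
print(full_mask)  # tensor([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

The decode-phase build_decode_attention_mask override then grows each request's attention_mask_seq_len by one per step, so the mask stays aligned with the merged KV cache that now includes the image patches.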