xinference-0.12.3-py3-none-any.whl → xinference-0.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +6 -6
- xinference/client/restful/restful_client.py +0 -2
- xinference/core/model.py +21 -4
- xinference/core/scheduler.py +2 -0
- xinference/core/worker.py +74 -45
- xinference/deploy/utils.py +33 -2
- xinference/model/llm/__init__.py +5 -0
- xinference/model/llm/llm_family.json +240 -1
- xinference/model/llm/llm_family.py +32 -8
- xinference/model/llm/llm_family_modelscope.json +192 -0
- xinference/model/llm/mlx/__init__.py +13 -0
- xinference/model/llm/mlx/core.py +408 -0
- xinference/model/llm/pytorch/chatglm.py +2 -9
- xinference/model/llm/pytorch/cogvlm2.py +206 -21
- xinference/model/llm/pytorch/core.py +213 -40
- xinference/model/llm/pytorch/glm4v.py +171 -15
- xinference/model/llm/pytorch/qwen_vl.py +168 -7
- xinference/model/llm/pytorch/utils.py +53 -62
- xinference/model/llm/utils.py +24 -5
- xinference/model/rerank/core.py +5 -0
- xinference/thirdparty/deepseek_vl/serve/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_deepseek.py +510 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/__init__.py +13 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/gradio_utils.py +94 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/presets.py +96 -0
- xinference/thirdparty/deepseek_vl/serve/app_modules/utils.py +229 -0
- xinference/thirdparty/deepseek_vl/serve/inference.py +170 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.0fb6f3ab.js +3 -0
- xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1130403f9e46f5738a23b45ac59b57de8f360c908c713e2c0670c2cce9bd367a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63090c842376cdd368c3ded88a333ef40d94785747651343040a6f7872a223.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9375a35b05d56989b2755bf72161fa707c92f28569d33765a75f91a568fda6e9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d6c643278a0b28320e6f33a60f5fb64c053997cbdc39a60e53ccc574688ade9e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +1 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/METADATA +4 -1
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/RECORD +55 -44
- xinference/web/ui/build/static/js/main.77dd47c3.js +0 -3
- xinference/web/ui/build/static/js/main.77dd47c3.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cd591866aa345566e0b63fb51ff2043e163a770af6fdc2f3bad395d046353e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/37c1476717199863bbba1530e3513a9368f8f73001b75b4a85c2075956308027.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46edc1fe657dfedb2e673148332bb442c6eb98f09f2592c389209e376510afa5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/82db357f3fd5b32215d747ee593f69ff06c95ad6cde37f71a96c8290aaab64c0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bc6da27195ec4607bb472bf61f97c928ad4966fa64e4c2247661bedb7400abba.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f118f99c22b713c678c1209c4e1dd43fe86e3f6e801a4c0c35d3bbf41fd05fe6.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +0 -1
- /xinference/web/ui/build/static/js/{main.77dd47c3.js.LICENSE.txt → main.0fb6f3ab.js.LICENSE.txt} +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/LICENSE +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/WHEEL +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.3.dist-info → xinference-0.13.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/core.py (new file)

@@ -0,0 +1,408 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import platform
+import sys
+import time
+import uuid
+from typing import Dict, Iterable, Iterator, List, Optional, TypedDict, Union
+
+from ....fields import max_tokens_field
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
+    LoRA,
+)
+from ..core import LLM
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import ChatModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class MLXModelConfig(TypedDict, total=False):
+    revision: Optional[str]
+    max_gpu_memory: str
+    trust_remote_code: bool
+
+
+class MLXGenerateConfig(TypedDict, total=False):
+    max_tokens: int
+    temperature: float
+    repetition_penalty: Optional[float]
+    repetition_context_size: Optional[float]
+    top_p: float
+    logit_bias: Optional[Dict[int, float]]
+    stop: Optional[Union[str, List[str]]]
+    stop_token_ids: Optional[Union[int, List[int]]]
+    stream: bool
+    stream_options: Optional[Union[dict, None]]
+
+
+class MLXModel(LLM):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        model_config: Optional[MLXModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        self._use_fast_tokenizer = True
+        self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
+        if peft_model is not None:
+            raise ValueError("MLX engine has not supported lora yet")
+
+    def _sanitize_model_config(
+        self, model_config: Optional[MLXModelConfig]
+    ) -> MLXModelConfig:
+        if model_config is None:
+            model_config = MLXModelConfig()
+        model_config.setdefault("revision", self.model_spec.model_revision)
+        model_config.setdefault("trust_remote_code", True)
+        return model_config
+
+    def _sanitize_generate_config(
+        self,
+        generate_config: Optional[MLXGenerateConfig],
+    ) -> MLXGenerateConfig:
+        if generate_config is None:
+            generate_config = MLXGenerateConfig()
+
+        generate_config.setdefault("max_tokens", max_tokens_field.default)
+        # default config is adapted from
+        # https://github.com/ml-explore/mlx-examples/blob/f212b770d8b5143e23102eda20400ae43340f844/llms/mlx_lm/utils.py#L129
+        generate_config.setdefault("temperature", 0.0)
+        generate_config.setdefault("repetition_penalty", None)
+        generate_config.setdefault("repetition_context_size", 20)
+        generate_config.setdefault("top_p", 1.0)
+        generate_config.setdefault("logit_bias", None)
+        return generate_config
+
+    def _load_model(self, **kwargs):
+        try:
+            from mlx_lm import load
+        except ImportError:
+            error_message = "Failed to import module 'mlx_lm'"
+            installation_guide = [
+                "Please make sure 'mlx_lm' is installed. ",
+                "You can install it by `pip install mlx_lm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        tokenizer_config = dict(
+            use_fast=self._use_fast_tokenizer,
+            trust_remote_code=kwargs["trust_remote_code"],
+            revision=kwargs["revision"],
+        )
+        logger.debug(
+            "loading model with tokenizer config: %s, model config: %s",
+            tokenizer_config,
+            self._model_config,
+        )
+
+        return load(
+            self.model_path,
+            tokenizer_config=tokenizer_config,
+            model_config=self._model_config,
+        )
+
+    def load(self):
+        kwargs = {}
+        kwargs["revision"] = self._model_config.get(
+            "revision", self.model_spec.model_revision
+        )
+        kwargs["trust_remote_code"] = self._model_config.get("trust_remote_code")
+
+        self._model, self._tokenizer = self._load_model(**kwargs)
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["mlx"]:
+            return False
+        if sys.platform != "darwin" or platform.processor() != "arm":
+            # only work for Mac M chips
+            return False
+        if "generate" not in llm_family.model_ability:
+            return False
+        return True
+
+    def _generate_stream(self, prompt: str, kwargs: MLXGenerateConfig):
+        import mlx.core as mx
+        from mlx_lm.utils import generate_step
+
+        model = self._model
+        model_uid = self.model_uid
+        tokenizer = self._tokenizer
+        max_tokens = kwargs["max_tokens"]
+        chunk_id = str(uuid.uuid4())
+        stop_token_ids = kwargs.get("stop_token_ids", [])
+        stream = kwargs.get("stream", False)
+        stream_options = kwargs.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        prompt_tokens = mx.array(tokenizer.encode(prompt))
+        input_echo_len = len(prompt_tokens)
+
+        i = 0
+        start = time.time()
+        output = ""
+        for (token, _), i in zip(
+            generate_step(
+                prompt_tokens,
+                model,
+                temp=kwargs["temperature"],
+                repetition_penalty=kwargs["repetition_penalty"],
+                repetition_context_size=kwargs["repetition_context_size"],
+                top_p=kwargs["top_p"],
+                logit_bias=kwargs["logit_bias"],
+            ),
+            range(max_tokens),
+        ):
+            if token == tokenizer.eos_token_id or token in stop_token_ids:  # type: ignore
+                break
+
+            # Yield the last segment if streaming
+            out = tokenizer.decode(
+                token,
+                skip_special_tokens=True,
+                spaces_between_special_tokens=False,
+                clean_up_tokenization_spaces=True,
+            )
+
+            if stream:
+                # this special character is mainly for qwen
+                out = out.strip("�")
+                output = out
+            else:
+                output += out
+
+            completion_choice = CompletionChoice(
+                text=output, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=chunk_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=i,
+                total_tokens=(input_echo_len + i),
+            )
+
+            yield completion_chunk, completion_usage
+
+        logger.info(
+            f"Average generation speed: {i / (time.time() - start):.2f} tokens/s."
+        )
+
+        if i == max_tokens - 1:
+            finish_reason = "length"
+        else:
+            finish_reason = "stop"
+
+        if stream:
+            completion_choice = CompletionChoice(
+                text="", index=0, logprobs=None, finish_reason=finish_reason
+            )
+        else:
+            completion_choice = CompletionChoice(
+                text=output, index=0, logprobs=None, finish_reason=finish_reason
+            )
+
+        completion_chunk = CompletionChunk(
+            id=chunk_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=input_echo_len,
+            completion_tokens=i,
+            total_tokens=(input_echo_len + i),
+        )
+
+        yield completion_chunk, completion_usage
+
+        if include_usage:
+            completion_chunk = CompletionChunk(
+                id=chunk_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=i,
+                total_tokens=(input_echo_len + i),
+            )
+            yield completion_chunk, completion_usage
+
+    def generate(
+        self, prompt: str, generate_config: Optional[MLXGenerateConfig] = None
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        def generator_wrapper(
+            prompt: str, generate_config: MLXGenerateConfig
+        ) -> Iterator[CompletionChunk]:
+            for completion_chunk, completion_usage in self._generate_stream(
+                prompt,
+                generate_config,
+            ):
+                completion_chunk["usage"] = completion_usage
+                yield completion_chunk
+
+        logger.debug(
+            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
+        )
+
+        generate_config = self._sanitize_generate_config(generate_config)
+
+        assert self._model is not None
+        assert self._tokenizer is not None
+
+        stream = generate_config.get("stream", False)
+        if not stream:
+            for completion_chunk, completion_usage in self._generate_stream(
+                prompt,
+                generate_config,
+            ):
+                pass
+            completion = Completion(
+                id=completion_chunk["id"],
+                object=completion_chunk["object"],
+                created=completion_chunk["created"],
+                model=completion_chunk["model"],
+                choices=completion_chunk["choices"],
+                usage=completion_usage,
+            )
+            return completion
+        else:
+            return generator_wrapper(prompt, generate_config)
+
+
+class MLXChatModel(MLXModel, ChatModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        model_config: Optional[MLXModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            model_config,
+            peft_model,
+        )
+
+    def _sanitize_generate_config(
+        self,
+        generate_config: Optional[MLXGenerateConfig],
+    ) -> MLXGenerateConfig:
+        generate_config = super()._sanitize_generate_config(generate_config)
+        if (
+            (not generate_config.get("stop"))
+            and self.model_family.prompt_style
+            and self.model_family.prompt_style.stop
+        ):
+            generate_config["stop"] = self.model_family.prompt_style.stop.copy()
+        if (
+            generate_config.get("stop_token_ids", None) is None
+            and self.model_family.prompt_style
+            and self.model_family.prompt_style.stop_token_ids
+        ):
+            generate_config[
+                "stop_token_ids"
+            ] = self.model_family.prompt_style.stop_token_ids.copy()
+
+        return generate_config
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["mlx"]:
+            return False
+        if sys.platform != "darwin" or platform.processor() != "arm":
+            # only work for Mac M chips
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[MLXGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        tools = generate_config.pop("tools", []) if generate_config else None  # type: ignore
+        full_prompt = self.get_full_prompt(
+            self.model_family, prompt, system_prompt, chat_history, tools
+        )
+
+        generate_config = self._sanitize_generate_config(generate_config)
+        # TODO(codingl2k1): qwen hacky to set stop for function call.
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
+            stop = generate_config.get("stop")
+            if isinstance(stop, str):
+                generate_config["stop"] = [stop, "Observation:"]
+            elif isinstance(stop, Iterable):
+                assert not isinstance(stop, str)
+                generate_config["stop"] = list(stop) + ["Observation:"]
+            else:
+                generate_config["stop"] = "Observation:"
+
+        stream = generate_config.get("stream", False)
+        if stream:
+            it = self.generate(full_prompt, generate_config)
+            assert isinstance(it, Iterator)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self.generate(full_prompt, generate_config)
+            assert not isinstance(c, Iterator)
+            if tools:
+                return self._tool_calls_completion(
+                    self.model_family, self.model_uid, c, tools
+                )
+            return self._to_chat_completion(c)
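The new module above is what makes Apple-silicon Macs a first-class backend: MLXModel.match() only accepts model_format == "mlx" on darwin/arm. A minimal client-side sketch of exercising it, assuming a locally running Xinference endpoint; the model name, engine label, and quantization value below are illustrative assumptions, not taken from this diff:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")

# Hypothetical launch: "mlx" as model_format is what MLXModel.match() checks;
# the model name, engine label and quantization value are assumptions here.
model_uid = client.launch_model(
    model_name="qwen1.5-chat",
    model_engine="MLX",
    model_format="mlx",
    quantization="4-bit",
)
model = client.get_model(model_uid)

# Chat goes through MLXChatModel.chat(); max_tokens/temperature map onto MLXGenerateConfig.
print(model.chat("Hello!", generate_config={"max_tokens": 64, "temperature": 0.0}))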
xinference/model/llm/pytorch/chatglm.py

@@ -29,6 +29,7 @@ from ....types import (
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import GLM4_TOOL_CALL_FAMILY
 from .core import PytorchChatModel, PytorchModelConfig
 
 
@@ -103,7 +104,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         if tools is None:
             return False
         tool_choice = generate_config.pop("tool_choice", "none")
-        if self.model_family.model_name
+        if self.model_family.model_name in GLM4_TOOL_CALL_FAMILY:
             chat_history[:] = self.process_messages(
                 chat_history, tools=tools, tool_choice=tool_choice
             )
@@ -335,14 +336,6 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             ),
         )
 
-    @staticmethod
-    def require_attention_mask():
-        """
-        GLM4 needs to use attention mask and position ids during inference.
-        Otherwise, the inference result would be not available.
-        """
-        return True
-
     def prepare_sanitize_generate_config(self, req: InferenceRequest):
         """
         Set temperature and top_p to 0.8 by default
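The chatglm.py change gates tool-call preprocessing on the new GLM4_TOOL_CALL_FAMILY list imported from ..utils, so only GLM4-family models get their chat history rewritten by process_messages(). A hedged sketch of the request shape this affects (model uid and tool schema are illustrative); note that tools and tool_choice travel inside generate_config, matching the generate_config.pop(...) calls in the diff:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("glm4-chat")  # assumes a GLM4 chat model was launched under this uid

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
# tools/tool_choice are consumed by the generate_config.pop("tools"/"tool_choice")
# calls shown above before the prompt is built.
resp = model.chat(
    "What's the weather in Paris?",
    generate_config={"tools": tools, "tool_choice": "auto", "max_tokens": 256},
)
print(resp["choices"][0]["message"])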
xinference/model/llm/pytorch/cogvlm2.py

@@ -23,6 +23,7 @@ import requests
 import torch
 from PIL import Image
 
+from ....core.scheduler import InferenceRequest
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
@@ -35,11 +36,30 @@ from ....types import (
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import get_max_src_len
 
 logger = logging.getLogger(__name__)
 
-
-
+
+LANGUAGE_TOKEN_TYPE = 0
+VISION_TOKEN_TYPE = 1
+
+
+def recur_move_to(item, tgt, criterion_func):
+    """
+    This function is copied from https://github.com/THUDM/CogVLM2/blob/main/basic_demo/cli_demo_batch_inference.py
+    """
+    if criterion_func(item):
+        device_copy = item.to(tgt)
+        return device_copy
+    elif isinstance(item, list):
+        return [recur_move_to(v, tgt, criterion_func) for v in item]
+    elif isinstance(item, tuple):
+        return tuple([recur_move_to(v, tgt, criterion_func) for v in item])
+    elif isinstance(item, dict):
+        return {k: recur_move_to(v, tgt, criterion_func) for k, v in item.items()}
+    else:
+        return item
 
 
 class CogVLM2Model(PytorchChatModel):
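The recur_move_to helper added above (copied from the CogVLM2 batch-inference demo) walks nested lists, tuples, and dicts and calls .to(tgt) only on leaves that satisfy the criterion. A standalone sketch of the two passes the engine later performs on a prefill batch, first to a device and then to a floating-point dtype:

import torch

def recur_move_to(item, tgt, criterion_func):
    # Same logic as the helper added to cogvlm2.py: recursively visit nested
    # lists/tuples/dicts and call .to(tgt) on leaves matching the criterion.
    if criterion_func(item):
        return item.to(tgt)
    elif isinstance(item, list):
        return [recur_move_to(v, tgt, criterion_func) for v in item]
    elif isinstance(item, tuple):
        return tuple(recur_move_to(v, tgt, criterion_func) for v in item)
    elif isinstance(item, dict):
        return {k: recur_move_to(v, tgt, criterion_func) for k, v in item.items()}
    else:
        return item

batch = {
    "input_ids": torch.zeros(2, 8, dtype=torch.long),
    "images": [[torch.rand(3, 4, 4)]],  # nested list of image tensors
    "meta": "not a tensor",             # non-tensor leaves pass through untouched
}
# Pass 1: move every tensor to the target device ("cpu" here; "cuda"/"mps" in practice).
batch = recur_move_to(batch, "cpu", lambda x: isinstance(x, torch.Tensor))
# Pass 2: cast only floating-point tensors, mirroring the dtype pass in build_prefill_kwargs.
batch = recur_move_to(
    batch,
    torch.float16,
    lambda x: isinstance(x, torch.Tensor) and torch.is_floating_point(x),
)
print(batch["images"][0][0].dtype)  # torch.float16; input_ids stays int64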
@@ -176,11 +196,33 @@ class CogVLM2Model(PytorchChatModel):
                         content["image_url"]["url"]
                     )
                 assistant = chat_history[i + 1]["content"]
-
-
-                query = query + f" {assistant}"
+                history.append((user, assistant))
+                query = assistant  # type: ignore
         return query, history, [pixel_values]
 
+    def get_query_and_history(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+    ):
+        content, image = self._message_content_to_cogvlm2(prompt)
+
+        history = []
+        history_image = None
+        if chat_history:
+            query, history, history_image = self._history_content_to_cogvlm2(
+                system_prompt, chat_history  # type: ignore
+            )
+
+        if image and history_image:
+            history = []
+            query = content
+        else:
+            image = image if image else history_image
+            query = content
+        return query, image, history
+
     def chat(
         self,
         prompt: Union[str, List[Dict]],
@@ -198,22 +240,9 @@ class CogVLM2Model(PytorchChatModel):
             else 512,
         }
 
-
-
-
-        query = ""
-        history_image = None
-        if chat_history:
-            query, history, history_image = self._history_content_to_cogvlm2(
-                system_prompt, chat_history
-            )
-
-        if image and history_image:
-            history = []
-            query = system_prompt + f" USER: {content} ASSISTANT:"
-        else:
-            image = image if image else history_image
-            query = query + f" USER: {content} ASSISTANT:"
+        query, image, history = self.get_query_and_history(
+            prompt, system_prompt=system_prompt, chat_history=chat_history
+        )
 
         input_by_model = self._model.build_conversation_input_ids(
             self._tokenizer,
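Both the rewritten history handling and the new get_query_and_history() consume OpenAI-style content parts (note the content["image_url"]["url"] access above). A hedged sketch of such a multimodal chat request through the client; the model uid and image URL are illustrative:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("cogvlm2")  # assumes a CogVLM2 model was launched under this uid

# The prompt is a list of content parts, which get_query_and_history() turns
# into a (query, image, history) triple before build_conversation_input_ids.
prompt = [
    {"type": "text", "text": "Describe this image in one sentence."},
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
]
resp = model.chat(prompt, generate_config={"max_tokens": 128})
print(resp["choices"][0]["message"]["content"])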
@@ -319,3 +348,159 @@ class CogVLM2Model(PytorchChatModel):
             ),
         )
         yield chunk
+
+    @staticmethod
+    def build_position_ids(x, attention_mask=None):
+        """
+        Copied from https://huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B-int4/blob/main/modeling_cogvlm.py
+        """
+        # Fix: follow the official open-source implementation
+        if attention_mask is not None:
+            tmp = x.clone()
+            tmp[~(attention_mask.bool())] = -1
+        else:
+            tmp = x.clone()
+        # image boi eoi token as LANGUAGE_TOKEN_TYPE
+        is_boi_eoi = torch.zeros_like(x, dtype=torch.bool)
+        is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & (
+            tmp[:, :-1] == LANGUAGE_TOKEN_TYPE
+        )
+        is_boi_eoi[:, 0] |= tmp[:, 0] == VISION_TOKEN_TYPE
+        is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & (
+            tmp[:, 1:] == LANGUAGE_TOKEN_TYPE
+        )
+        is_boi_eoi[:, -1] |= tmp[:, -1] == VISION_TOKEN_TYPE
+        tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE
+        # final position ids
+        y = torch.zeros_like(x, dtype=torch.long)
+        y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | (
+            (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE)
+        )
+        y = y.cumsum(dim=-1)
+        return y
+
+    def get_dtype(self):
+        return self._torch_type
+
+    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
+        query, image, history = self.get_query_and_history(
+            prompt, system_prompt=system_prompt, chat_history=chat_history
+        )
+
+        input_by_model: dict = self._model.build_conversation_input_ids(
+            self._tokenizer,
+            query=query,
+            history=history,
+            images=image,
+            template_version="chat",
+        )
+        return {
+            "input_ids": input_by_model["input_ids"],  # seq_len
+            "token_type_ids": input_by_model["token_type_ids"],  # seq_len
+            "attention_mask": input_by_model["attention_mask"],  # seq_len
+            "images": input_by_model["images"],
+        }
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        See https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/main/generation_config.json
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.6
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.9
+        return raw_config
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        context_len = self.get_context_len()
+        assert isinstance(prompts[0], dict)
+        images = []
+        max_length = float("-inf")
+        for i, feature in enumerate(prompts):
+            req = req_list[i]
+            if "images" in feature:
+                images.append(feature.pop("images", None))
+            max_src_len = get_max_src_len(context_len, req)
+            input_ids = feature["input_ids"][-max_src_len:]
+            req.prompt_tokens = input_ids.tolist()
+            feature["input_ids"] = input_ids
+            feature["token_type_ids"] = feature["token_type_ids"][-max_src_len:]
+            feature["attention_mask"] = feature["attention_mask"][-max_src_len:]
+            req.extra_kwargs["attention_mask_seq_len"] = feature[
+                "attention_mask"
+            ].shape[0]
+            max_length = max(len(input_ids), max_length)
+
+        def pad_to_max_length_internal(feature, max_len, idx):
+            padding_length = max_len - len(feature["input_ids"])
+            req_list[idx].padding_len = padding_length
+            feature["input_ids"] = torch.cat(
+                [torch.full((padding_length,), 0), feature["input_ids"]]
+            )
+            feature["token_type_ids"] = torch.cat(
+                [
+                    torch.zeros(padding_length, dtype=torch.long),
+                    feature["token_type_ids"],
+                ]
+            )
+            feature["attention_mask"] = torch.cat(
+                [
+                    torch.zeros(padding_length, dtype=torch.long),
+                    feature["attention_mask"],
+                ]
+            )
+            return feature
+
+        features = [
+            pad_to_max_length_internal(feature, max_length, i)
+            for i, feature in enumerate(prompts)
+        ]
+        batch = {
+            key: torch.stack([feature[key] for feature in features])
+            for key in features[0].keys()
+        }
+
+        position_ids = self.build_position_ids(batch["token_type_ids"])
+        batch["position_ids"] = position_ids
+
+        for i in range(len(prompts)):
+            req = req_list[i]
+            req.extra_kwargs["max_position_id"] = position_ids[i : i + 1, -1].item()
+
+        if images:
+            batch["images"] = images
+
+        batch = recur_move_to(
+            batch, self._device, lambda x: isinstance(x, torch.Tensor)
+        )
+        dtype = self.get_dtype()
+        if dtype:
+            batch = recur_move_to(
+                batch,
+                dtype,
+                lambda x: isinstance(x, torch.Tensor) and torch.is_floating_point(x),
+            )
+        return batch
+
+    def build_decode_token_type_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        token_type_ids = torch.full(
+            (batch_size, 1), fill_value=1, dtype=torch.long, device=self._device
+        )
+        return token_type_ids
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        tmp = []
+        for r in reqs:
+            r.extra_kwargs["max_position_id"] += 1
+            tmp.append(r.extra_kwargs["max_position_id"])
+        position_ids = torch.as_tensor(
+            tmp, device=self._device, dtype=torch.long
+        ).unsqueeze(1)
+        return position_ids
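A quick way to see what the copied build_position_ids() produces: interior vision tokens collapse onto a shared position, while language tokens and the run's boundary (boi/eoi-like) tokens each advance the counter. A small sketch, assuming xinference 0.13.0 is installed so the class can be imported:

import torch

from xinference.model.llm.pytorch.cogvlm2 import CogVLM2Model

# 0 = LANGUAGE_TOKEN_TYPE, 1 = VISION_TOKEN_TYPE; one batch row of 8 tokens
token_type_ids = torch.tensor([[0, 0, 1, 1, 1, 1, 0, 0]])
# build_position_ids is a @staticmethod, so no model instance is needed.
position_ids = CogVLM2Model.build_position_ids(token_type_ids)
print(position_ids)  # tensor([[0, 1, 2, 3, 3, 4, 5, 6]]) for this input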