xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/__init__.py +13 -0
- xinference/api/oauth2/common.py +14 -0
- xinference/api/oauth2/core.py +93 -0
- xinference/api/oauth2/types.py +36 -0
- xinference/api/oauth2/utils.py +44 -0
- xinference/api/restful_api.py +216 -27
- xinference/client/oscar/actor_client.py +18 -18
- xinference/client/restful/restful_client.py +96 -33
- xinference/conftest.py +63 -1
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +143 -3
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +244 -181
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +134 -13
- xinference/deploy/cmdline.py +142 -16
- xinference/deploy/local.py +39 -7
- xinference/deploy/supervisor.py +2 -0
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/core.py +8 -1
- xinference/model/embedding/core.py +3 -2
- xinference/model/embedding/model_spec_modelscope.json +60 -18
- xinference/model/image/stable_diffusion/core.py +4 -3
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +87 -3
- xinference/model/llm/llm_family.py +15 -5
- xinference/model/llm/llm_family_modelscope.json +92 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/core.py +8 -1
- xinference/model/multimodal/model_spec.json +9 -0
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/multimodal/qwen_vl.py +5 -9
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
- xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +36 -0
- xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
- xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
- xinference/web/ui/node_modules/react-cookie/package.json +55 -0
- xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
- xinference/web/ui/package-lock.json +37 -0
- xinference/web/ui/package.json +3 -2
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
- xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/chatglm.py CHANGED

@@ -11,13 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
+import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 from ....types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,
+    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -106,38 +112,74 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         tools = self._handle_tools(generate_config)
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_length = generate_config.get("max_tokens")
+        if max_length is not None:
+            kwargs["max_length"] = int(max_length)
+        # Tool calls only works for non stream, so we call chat directly.
+        if prompt == SPECIAL_TOOL_PROMPT and chat_history:
+            tool_message = chat_history.pop()
+            content = tool_message.get("content")
+            assert content is not None
+            prompt = content
+            kwargs["role"] = "observation"
+            chat_history = [h for h in chat_history if not h.get("tool_calls")]
+        if not chat_history:
+            chat_history = []
         if tools:
-            # Tool calls only works for non stream, so we call chat directly.
-            kwargs: Dict[str, Any] = {}
-            generate_config = generate_config or {}
-            temperature = generate_config.get("temperature")
-            if temperature is not None:
-                kwargs["temperature"] = float(temperature)
-            top_p = generate_config.get("top_p")
-            if top_p is not None:
-                kwargs["top_p"] = float(top_p)
-            max_length = generate_config.get("max_tokens")
-            if max_length is not None:
-                kwargs["max_length"] = int(max_length)
-            if prompt == SPECIAL_TOOL_PROMPT and chat_history:
-                tool_message = chat_history.pop()
-                content = tool_message.get("content")
-                assert content is not None
-                prompt = content
-                kwargs["role"] = "observation"
-                chat_history = [h for h in chat_history if not h.get("tool_calls")]
-            if not chat_history:
-                chat_history = []
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
             )
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, msg, tools
             )
         else:
-
-
-
-
-
-
+            stream = generate_config.get("stream", False)
+            if stream:
+
+                def _stream_generator():
+                    last_chunk_text_length = 0
+                    for chunk_text, _ in self._model.stream_chat(
+                        self._tokenizer, prompt, chat_history, **kwargs
+                    ):
+                        chunk_text = chunk_text[last_chunk_text_length:]
+                        last_chunk_text_length += len(chunk_text)
+                        completion_choice = CompletionChoice(
+                            text=chunk_text, index=0, logprobs=None, finish_reason=None
+                        )
+                        yield CompletionChunk(
+                            id=str(uuid.uuid1()),
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[completion_choice],
+                        )
+
+                return self._to_chat_completion_chunks(_stream_generator())
+            else:
+                response, _ = self._model.chat(
+                    self._tokenizer, prompt, chat_history, **kwargs
+                )
+                return ChatCompletion(
+                    id="chat" + str(uuid.uuid1()),
+                    object="chat.completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[
+                        ChatCompletionChoice(
+                            index=0,
+                            message={"role": "assistant", "content": response},
+                            finish_reason="stop",
+                        )
+                    ],
+                    usage=CompletionUsage(
+                        prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                    ),
+                )
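
The reworked `chat` above now streams through ChatGLM's `stream_chat`, which yields the cumulative response decoded so far; `_stream_generator` therefore slices off only the new suffix before wrapping it in a `CompletionChunk`. A minimal sketch of that cumulative-to-delta conversion, where `fake_stream_chat` is an illustrative stand-in for `self._model.stream_chat` and not part of xinference:

```python
from typing import Iterator, List, Tuple


def fake_stream_chat() -> Iterator[Tuple[str, List]]:
    # Stand-in for self._model.stream_chat: each yield carries the full
    # response generated so far plus the running history (ignored here).
    yield "Hel", []
    yield "Hello", []
    yield "Hello, world", []


def deltas(stream: Iterator[Tuple[str, List]]) -> Iterator[str]:
    emitted = 0  # plays the role of last_chunk_text_length
    for cumulative_text, _ in stream:
        chunk = cumulative_text[emitted:]  # only the newly generated suffix
        emitted += len(chunk)
        yield chunk


assert list(deltas(fake_stream_chat())) == ["Hel", "lo", ", world"]
```

Tracking the emitted length this way avoids resending text the client has already received; `_to_chat_completion_chunks` then turns each delta into an OpenAI-style chat chunk.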
xinference/model/llm/pytorch/core.py CHANGED

@@ -192,7 +192,8 @@ class PytorchModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq"]:
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in [
             "baichuan-chat",
             "vicuna-v1.3",
             "falcon",

@@ -211,11 +212,7 @@ class PytorchModel(LLM):
     def generate(
         self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        from .utils import
-            generate_stream,
-            generate_stream_chatglm,
-            generate_stream_falcon,
-        )
+        from .utils import generate_stream, generate_stream_falcon
 
         model_family_name = self.model_family.model_name.lower()
 

@@ -223,17 +220,7 @@ class PytorchModel(LLM):
             prompt: str, generate_config: PytorchGenerateConfig
         ) -> Iterator[CompletionChunk]:
             if "falcon" in model_family_name:
-                for completion_chunk,
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    yield completion_chunk
-            elif "chatglm" in model_family_name:
-                for completion_chunk, _ in generate_stream_chatglm(
+                for completion_chunk, completion_usage in generate_stream_falcon(
                     self.model_uid,
                     self._model,
                     self._tokenizer,

@@ -241,9 +228,10 @@ class PytorchModel(LLM):
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
             else:
-                for completion_chunk,
+                for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,
                     self._model,
                     self._tokenizer,

@@ -251,6 +239,7 @@ class PytorchModel(LLM):
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
 
         logger.debug(

@@ -274,16 +263,6 @@ class PytorchModel(LLM):
                 generate_config,
             ):
                 pass
-            elif "chatglm" in model_family_name:
-                for completion_chunk, completion_usage in generate_stream_chatglm(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    pass
             else:
                 for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,

@@ -442,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             "chatglm2-32k",
             "llama-2",
             "llama-2-chat",
+            "internlm2-chat",
         ]:
             return False
         if "chat" not in llm_family.model_ability:

@@ -465,7 +445,8 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
 
         generate_config = self._sanitize_generate_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]

@@ -485,6 +466,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         assert not isinstance(c, Iterator)
         if tools:
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, c, tools
             )
         return self._to_chat_completion(c)
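
Two changes recur in the `core.py` hunks: streamed chunks now carry token usage (`completion_chunk["usage"] = completion_usage`), and model matching resolves the family via `llm_family.model_family or llm_family.model_name`, so registrations that set an explicit `model_family` are treated like their base family. A hedged sketch of that fallback; `FamilyStub` and the names below are illustrative, not taken from `llm_family.json`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FamilyStub:
    # Illustrative stand-in for LLMFamilyV1, reduced to the two fields used here.
    model_name: str
    model_family: Optional[str] = None


def resolve_family(llm_family: FamilyStub) -> str:
    # The fallback used by the updated match()/chat() code paths.
    return llm_family.model_family or llm_family.model_name


assert resolve_family(FamilyStub("chatglm3")) == "chatglm3"
assert resolve_family(FamilyStub("my-finetune", model_family="qwen-chat")) == "qwen-chat"
```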
xinference/model/llm/pytorch/internlm2.py ADDED

@@ -0,0 +1,155 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import uuid
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
+    PytorchGenerateConfig,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchModelConfig
+
+
+class Internlm2PytorchChatModel(PytorchChatModel):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        pytorch_model_config: Optional[PytorchModelConfig] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            pytorch_model_config=pytorch_model_config,
+        )
+
+    def _load_model(self, **kwargs):
+        try:
+            from transformers import AutoModel, AutoTokenizer
+        except ImportError:
+            error_message = "Failed to import module 'transformers'"
+            installation_guide = [
+                "Please make sure 'transformers' is installed. ",
+                "You can install it by `pip install transformers`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path,
+            trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
+            revision=kwargs["revision"],
+        )
+        model = AutoModel.from_pretrained(
+            self.model_path,
+            **kwargs,
+        )
+        return model, tokenizer
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family != "internlm2-chat":
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_new_tokens = generate_config.get("max_tokens")
+        if max_new_tokens is not None:
+            kwargs["max_length"] = int(max_new_tokens)
+
+        stream = generate_config.get("stream", False)
+        if chat_history:
+            input_history = [
+                (chat_history[i]["content"], (chat_history[i + 1]["content"]))
+                for i in range(0, len(chat_history), 2)
+            ]
+        else:
+            input_history = []
+        if stream:
+
+            def _stream_generator():
+                last_chunk_text_length = 0
+                for chunk_text, _ in self._model.stream_chat(
+                    self._tokenizer, prompt, input_history, **kwargs
+                ):
+                    chunk_text = chunk_text[last_chunk_text_length:]
+                    last_chunk_text_length += len(chunk_text)
+                    completion_choice = CompletionChoice(
+                        text=chunk_text, index=0, logprobs=None, finish_reason=None
+                    )
+                    yield CompletionChunk(
+                        id=str(uuid.uuid1()),
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+
+            return self._to_chat_completion_chunks(_stream_generator())
+        else:
+            response, _ = self._model.chat(
+                self._tokenizer, prompt, input_history, **kwargs
+            )
+            return ChatCompletion(
+                id="chat" + str(uuid.uuid1()),
+                object="chat.completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    ChatCompletionChoice(
+                        index=0,
+                        message={"role": "assistant", "content": response},
+                        finish_reason="stop",
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
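
The new `Internlm2PytorchChatModel.chat` folds the flat, OpenAI-style `chat_history` into `(user, assistant)` tuples before passing it to the model's `chat`/`stream_chat`. A small sketch of that pairing, mirroring the list comprehension above; the messages are made up for illustration:

```python
# Flat history in OpenAI chat format, assumed to be strict user/assistant pairs.
chat_history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "What is xinference?"},
    {"role": "assistant", "content": "A model-serving framework."},
]

# Same pairing as Internlm2PytorchChatModel.chat; an odd-length history
# would raise IndexError on chat_history[i + 1].
input_history = [
    (chat_history[i]["content"], chat_history[i + 1]["content"])
    for i in range(0, len(chat_history), 2)
]

assert input_history == [
    ("Hi", "Hello!"),
    ("What is xinference?", "A model-serving framework."),
]
```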
xinference/model/llm/pytorch/utils.py CHANGED

@@ -14,7 +14,6 @@
 
 import gc
 import logging
-import re
 import time
 import uuid
 from threading import Thread

@@ -23,7 +22,6 @@ from typing import Iterable, Iterator, Tuple
 import torch
 from transformers import GenerationConfig, TextIteratorStreamer
 from transformers.generation.logits_process import (
-    LogitsProcessor,
     LogitsProcessorList,
     RepetitionPenaltyLogitsProcessor,
     TemperatureLogitsWarper,

@@ -480,154 +478,3 @@ def generate_stream_falcon(
     # clean
     gc.collect()
     torch.cuda.empty_cache()
-
-
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(
-        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
-    ) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-invalid_score_processor = InvalidScoreLogitsProcessor()
-
-
-def process_response(response):
-    response = response.strip()
-    response = response.replace("[[训练时间]]", "2023年")
-    punkts = [
-        [",", ","],
-        ["!", "!"],
-        [":", ":"],
-        [";", ";"],
-        ["\\?", "?"],
-    ]
-    for item in punkts:
-        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-    return response
-
-
-@torch.inference_mode()
-def generate_stream_chatglm(
-    model_uid,
-    model,
-    tokenizer,
-    prompt,
-    device,
-    generate_config,
-    judge_sent_end=False,
-):
-    stream = generate_config.get("stream", False)
-    temperature = float(generate_config.get("temperature", 1.0))
-    repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-    top_p = float(generate_config.get("top_p", 1.0))
-    max_new_tokens = int(generate_config.get("max_tokens", 256))
-    echo = generate_config.get("echo", False)
-    stop_str = generate_config.get("stop", None)
-    eos_token_id = generate_config.get("stop_token_ids", [])
-    eos_token_id.append(tokenizer.eos_token_id)
-
-    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
-    input_echo_len = len(inputs["input_ids"][0])
-    gen_kwargs = {
-        "max_length": max_new_tokens + input_echo_len,
-        "do_sample": True if temperature > 1e-5 else False,
-        "top_p": top_p,
-        "repetition_penalty": repetition_penalty,
-        "logits_processor": [invalid_score_processor],
-    }
-    if temperature > 1e-5:
-        gen_kwargs["temperature"] = temperature
-
-    total_len = 0
-    last_response_length = 0
-    for total_ids in model.stream_generate(
-        **inputs, eos_token_id=eos_token_id, **gen_kwargs
-    ):
-        total_ids = total_ids.tolist()[0]
-        total_len = len(total_ids)
-        if echo:
-            output_ids = total_ids
-        else:
-            output_ids = total_ids[input_echo_len:]
-        response = tokenizer.decode(output_ids)
-        response = process_response(response)
-
-        partially_stopped = False
-        stopped = False
-        if stop_str:
-            if isinstance(stop_str, str):
-                pos = response.rfind(stop_str, 0)
-                if pos != -1:
-                    response = response[:pos]
-                    stopped = True
-                else:
-                    partially_stopped = is_partial_stop(response, stop_str)
-            elif isinstance(stop_str, Iterable):
-                for each_stop in stop_str:
-                    pos = response.rfind(each_stop, 0)
-                    if pos != -1:
-                        response = response[:pos]
-                        stopped = True
-                        break
-                    else:
-                        partially_stopped = is_partial_stop(response, each_stop)
-                        if partially_stopped:
-                            break
-            else:
-                raise ValueError("Invalid stop field type.")
-
-        if stream:
-            response = response.strip("�")
-            tmp_response_length = len(response)
-            response = response[last_response_length:]
-            last_response_length = tmp_response_length
-
-        if not partially_stopped:
-            completion_choice = CompletionChoice(
-                text=response, index=0, logprobs=None, finish_reason=None
-            )
-            completion_chunk = CompletionChunk(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
-                prompt_tokens=input_echo_len,
-                completion_tokens=(total_len - input_echo_len),
-                total_tokens=total_len,
-            )
-
-            yield completion_chunk, completion_usage
-
-        if stopped:
-            break
-
-    if total_len - input_echo_len == max_new_tokens - 1:
-        finish_reason = "length"
-    else:
-        finish_reason = "stop"
-
-    completion_choice = CompletionChoice(
-        text=response, index=0, logprobs=None, finish_reason=finish_reason
-    )
-    completion_chunk = CompletionChunk(
-        id=str(uuid.uuid1()),
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=[completion_choice],
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=input_echo_len,
-        completion_tokens=(total_len - input_echo_len),
-        total_tokens=total_len,
-    )
-
-    yield completion_chunk, completion_usage
xinference/model/llm/utils.py CHANGED

@@ -16,7 +16,7 @@ import json
 import logging
 import time
 import uuid
-from typing import AsyncGenerator, Dict, Iterator, List, Optional
+from typing import AsyncGenerator, Dict, Iterator, List, Optional, cast
 
 from xinference.model.llm.llm_family import PromptStyleV1
 

@@ -299,6 +299,24 @@ Begin!"""
             )
             ret += chat_history[-1]["role"] + ":"
             return ret
+        elif prompt_style.style_name == "INTERNLM2":
+            ret = (
+                "<s>"
+                if prompt_style.system_prompt == ""
+                else "<s>[UNUSED_TOKEN_146]system\n"
+                + prompt_style.system_prompt
+                + prompt_style.intra_message_sep
+                + "\n"
+            )
+            for message in chat_history:
+                role = message["role"]
+                content = message["content"]
+
+                if content:
+                    ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
         elif prompt_style.style_name == "ADD_COLON_SINGLE_COT":
             ret = prompt_style.system_prompt + prompt_style.intra_message_sep
             for message in chat_history:

@@ -360,7 +378,7 @@ Begin!"""
 
     @classmethod
     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
-
+        chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],

@@ -376,12 +394,16 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        usage = chunk.get("usage")
+        if usage is not None:
+            chat_chunk["usage"] = usage
+        return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _get_first_chat_completion_chunk(
         cls, chunk: CompletionChunk
     ) -> ChatCompletionChunk:
-
+        chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],

@@ -397,6 +419,10 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        usage = chunk.get("usage")
+        if usage is not None:
+            chat_chunk["usage"] = usage
+        return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _to_chat_completion_chunks(

@@ -494,16 +520,19 @@ Begin!"""
         return text, None, None
 
     @classmethod
-    def _tool_calls_completion(cls,
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
         _id = str(uuid.uuid4())
-
+        family = model_family.model_family or model_family.model_name
+        if "gorilla-openfunctions-v1" == family:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif
+        elif "chatglm3" == family:
             content, func, args = cls._eval_chatglm3_arguments(c, tools)
-        elif
+        elif "qwen-chat" == family:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)
         else:
-            raise Exception(
+            raise Exception(
+                f"Model {model_family.model_name} is not support tool calls."
+            )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
 
         if content: