xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/event.py +1 -1
  8. xinference/core/model.py +15 -4
  9. xinference/core/status_guard.py +1 -1
  10. xinference/core/supervisor.py +58 -72
  11. xinference/core/worker.py +73 -102
  12. xinference/deploy/cmdline.py +175 -6
  13. xinference/deploy/test/test_cmdline.py +2 -0
  14. xinference/deploy/utils.py +1 -1
  15. xinference/device_utils.py +29 -3
  16. xinference/fields.py +5 -1
  17. xinference/model/audio/model_spec.json +8 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/embedding/core.py +13 -0
  21. xinference/model/image/__init__.py +29 -0
  22. xinference/model/image/core.py +6 -0
  23. xinference/model/image/custom.py +109 -0
  24. xinference/model/llm/__init__.py +92 -32
  25. xinference/model/llm/core.py +57 -102
  26. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  27. xinference/model/llm/llm_family.json +446 -2
  28. xinference/model/llm/llm_family.py +45 -41
  29. xinference/model/llm/llm_family_modelscope.json +208 -1
  30. xinference/model/llm/pytorch/deepseek_vl.py +89 -33
  31. xinference/model/llm/pytorch/qwen_vl.py +67 -12
  32. xinference/model/llm/pytorch/yi_vl.py +62 -45
  33. xinference/model/llm/utils.py +45 -15
  34. xinference/model/llm/vllm/core.py +21 -4
  35. xinference/model/rerank/core.py +48 -20
  36. xinference/thirdparty/omnilmm/chat.py +2 -1
  37. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  38. xinference/types.py +2 -0
  39. xinference/web/ui/build/asset-manifest.json +6 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  42. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  43. xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
  44. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
  45. xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
  61. xinference/web/ui/node_modules/.package-lock.json +33 -0
  62. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  63. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  64. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  65. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  66. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  67. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  68. xinference/web/ui/node_modules/delegate/package.json +31 -0
  69. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  70. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  71. xinference/web/ui/node_modules/select/bower.json +13 -0
  72. xinference/web/ui/node_modules/select/package.json +29 -0
  73. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  74. xinference/web/ui/package-lock.json +34 -0
  75. xinference/web/ui/package.json +1 -0
  76. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
  77. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
  78. xinference/client/oscar/__init__.py +0 -13
  79. xinference/client/oscar/actor_client.py +0 -611
  80. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  81. xinference/model/llm/pytorch/spec_model.py +0 -186
  82. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  83. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  89. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
  90. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
  91. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
  92. {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -84,6 +84,96 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B",
+        "model_hub": "modelscope"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "llama-3-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 8,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-8B-Instruct",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 70,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "LLM-Research/Meta-Llama-3-70B-Instruct",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "LLAMA3",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n\n",
+      "inter_message_sep": "<|eot_id|>",
+      "stop_token_ids": [
+        128001,
+        128009
+      ],
+      "stop": [
+        "<|end_of_text|>",
+        "<|eot_id|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,

@@ -323,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.0"
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {

@@ -1847,6 +1937,17 @@
         "model_id": "qwen/Qwen1.5-72B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",

@@ -1916,6 +2017,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",

@@ -1979,6 +2089,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",

@@ -3205,5 +3324,93 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
   }
 ]
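The entries above register new ModelScope families (llama-3, llama-3-instruct, Qwen1.5-110B-Chat in pytorch/GPTQ/AWQ formats, and the two Phi-3 mini models) in the built-in catalog. As a rough illustration of how such an entry is consumed, the sketch below launches the new llama-3-instruct family through the Python RESTful client; the endpoint address and the chosen spec are assumptions, and 0.11.0 may additionally require an engine selection argument that this diff does not show.

from xinference.client import RESTfulClient

# Assumed local supervisor endpoint; adjust to the actual deployment.
client = RESTfulClient("http://127.0.0.1:9997")

# Launch the llama-3-instruct family added above; the fields must match one of
# the model_specs entries (pytorch format, 8B size, 4-bit quantization here).
model_uid = client.launch_model(
    model_name="llama-3-instruct",
    model_format="pytorch",
    model_size_in_billions=8,
    quantization="4-bit",
)

model = client.get_model(model_uid)
print(model.chat("Say hello.")["choices"][0]["message"]["content"])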
xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16
 
         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer
 
-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()

@@ -149,10 +151,11 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {

@@ -184,6 +187,7 @@
 
         deepseek_history.extend(prompt_messages)
 
+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images
 
         # load images and prepare for inputs

@@ -192,41 +196,93 @@
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)
 
-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]
 
-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )
 
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
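With this change, DeepSeek-VL generation is routed through the third-party streamer and exposed both as a one-shot Completion and as an iterator of CompletionChunk objects, which the base class wraps into chat responses. A minimal consumption sketch from the client side follows; the endpoint and model uid are placeholders, not values taken from this diff.

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
model = client.get_model("deepseek-vl-chat")     # placeholder model uid

# Before this patch, stream=True raised "does not support stream";
# now the server emits OpenAI-style chat.completion.chunk deltas.
for chunk in model.chat(
    prompt="Describe the attached image.",
    generate_config={"stream": True},
):
    delta = chunk["choices"][0].get("delta", {})
    print(delta.get("content", ""), end="", flush=True)
print()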
xinference/model/llm/pytorch/qwen_vl.py

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

@@ -134,22 +132,79 @@
             if len(query_to_response) == 2:
                 qwen_history.append(query_to_response)
                 query_to_response = []
+
+        stream = generate_config.get("stream", False) if generate_config else False
+
+        if stream:
+            it = self._generate_stream(prompt, qwen_history)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(prompt, qwen_history)
+            return self._to_chat_completion(c)
+
+    def _generate(self, prompt: str, qwen_history: List) -> Completion:
         response, history = self._model.chat(
             self._tokenizer, query=prompt, history=qwen_history
         )
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": response},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=response, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
         )
+        return c
+
+    def _generate_stream(
+        self, prompt: str, qwen_history: List
+    ) -> Iterator[CompletionChunk]:
+        # response, history = model.chat(tokenizer, message, history=history)
+        response_generator = self._model.chat_stream(
+            self._tokenizer, query=prompt, history=qwen_history
+        )
+        full_response = ""
+        for response in response_generator:
+            inc_content = response[len(full_response) :]
+            full_response = response
+            completion_choice = CompletionChoice(
+                text=inc_content, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=-1,
+                completion_tokens=-1,
+                total_tokens=-1,
+            )
+            completion_chunk["usage"] = completion_usage
+            yield completion_chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        completion_chunk["usage"] = completion_usage
+        yield completion_chunk
xinference/model/llm/pytorch/yi_vl.py

@@ -27,9 +27,11 @@ from PIL import Image
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1

@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content
 
-    @staticmethod
-    def _parse_text(text):
-        lines = text.split("\n")
-        lines = [line for line in lines if line != ""]
-        count = 0
-        for i, line in enumerate(lines):
-            if "```" in line:
-                count += 1
-                items = line.split("`")
-                if count % 2 == 1:
-                    lines[i] = f'<pre><code class="language-{items[-1]}">'
-                else:
-                    lines[i] = f"<br></code></pre>"
-            else:
-                if i > 0:
-                    if count % 2 == 1:
-                        line = line.replace("`", r"\`")
-                        line = line.replace("<", "&lt;")
-                        line = line.replace(">", "&gt;")
-                        line = line.replace(" ", "&nbsp;")
-                        line = line.replace("*", "&ast;")
-                        line = line.replace("_", "&lowbar;")
-                        line = line.replace("-", "&#45;")
-                        line = line.replace(".", "&#46;")
-                        line = line.replace("!", "&#33;")
-                        line = line.replace("(", "&#40;")
-                        line = line.replace(")", "&#41;")
-                        line = line.replace("$", "&#36;")
-                    lines[i] = "<br>" + line
-        text = "".join(lines)
-        return text
-
     def chat(
         self,
         prompt: Union[str, List[Dict]],

@@ -164,12 +134,12 @@
         from transformers import TextIteratorStreamer
 
         # TODO(codingl2k1): implement stream mode.
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+
         if not generate_config:
             generate_config = {}
+
+        stream = generate_config.get("stream", False)
+
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
             KeywordsStoppingCriteria,

@@ -229,25 +199,72 @@
         t = Thread(target=self._model.generate, kwargs=generate_kwargs)
         t.start()
 
+        if stream:
+            it = self._generate_stream(streamer, stop_str)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
         generated_text = ""
         for new_text in streamer:
             generated_text += new_text
             if generated_text.endswith(stop_str):
                 generated_text = generated_text[: -len(stop_str)]
-        r = self._parse_text(generated_text)
-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": r},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
            ),
         )
+        return c
+
+    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        for i, new_text in enumerate(streamer):
+            if not new_text.endswith(stop_str):
+                completion_choice = CompletionChoice(
+                    text=new_text, index=0, logprobs=None, finish_reason=None
+                )
+                chunk = CompletionChunk(
+                    id=completion_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=-1,
+                    completion_tokens=-1,
+                    total_tokens=-1,
+                )
+                chunk["usage"] = completion_usage
+                yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=-1,
+            completion_tokens=-1,
+            total_tokens=-1,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
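All three vision-language models touched here (deepseek_vl, qwen_vl, yi_vl) now follow the same pattern: build plain text Completion / CompletionChunk objects and let shared helpers such as _to_chat_completion and _to_chat_completion_chunks (in xinference/model/llm/utils.py, also changed in this release) rewrap them as chat results. The snippet below is only a rough sketch of what that non-streaming conversion amounts to, not the actual helper.

def to_chat_completion_sketch(completion: dict) -> dict:
    # Illustrative only: promote a text completion to an OpenAI-style chat
    # completion, which is roughly the job _to_chat_completion performs.
    choice = completion["choices"][0]
    return {
        "id": "chat" + completion["id"],
        "object": "chat.completion",
        "created": completion["created"],
        "model": completion["model"],
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": choice["text"]},
                "finish_reason": choice["finish_reason"],
            }
        ],
        "usage": completion["usage"],
    }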