xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json

@@ -413,7 +413,7 @@
         ],
         "model_hub": "modelscope",
         "model_id": "ZhipuAI/chatglm3-6b",
-        "model_revision": "v1.0.0"
+        "model_revision": "v1.0.2"
       }
     ],
     "prompt_style": {
@@ -1289,7 +1289,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 262144,
     "model_name": "Yi-200k",
     "model_lang": [
       "en",
@@ -1328,7 +1328,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "Yi-chat",
     "model_lang": [
       "en",
@@ -1349,6 +1349,18 @@
         "model_id": "01ai/Yi-34B-Chat-{quantization}",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-6B-Chat",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -1385,6 +1397,130 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -1937,6 +2073,17 @@
         "model_id": "qwen/Qwen1.5-72B-Chat",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "gptq",
         "model_size_in_billions": "0_5",
@@ -2006,6 +2153,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-GPTQ-{quantization}",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-GPTQ-Int4",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "awq",
         "model_size_in_billions": "0_5",
@@ -2069,6 +2225,15 @@
         "model_id": "qwen/Qwen1.5-72B-Chat-AWQ",
         "model_hub": "modelscope"
       },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 110,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/Qwen1.5-110B-Chat-AWQ",
+        "model_hub": "modelscope"
+      },
       {
         "model_format": "ggufv2",
         "model_size_in_billions": "0_5",
@@ -2267,7 +2432,7 @@
   },
   {
     "version": 1,
-    "context_length": 32768,
+    "context_length": 65536,
     "model_name": "codeqwen1.5-chat",
     "model_lang": [
       "en",
@@ -2726,7 +2891,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "yi-vl-chat",
     "model_lang": [
       "en",
@@ -3295,5 +3460,93 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 128000,
+    "model_name": "phi-3-mini-128k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-128K-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-128k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "phi-3-mini-4k-instruct",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Phi-3-Mini-4k-Instruct is a 3.8 billion-parameter, lightweight, state-of-the-art open model trained using the Phi-3 datasets.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 4,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "LLM-Research/Phi-3-mini-4k-instruct",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "PHI3",
+      "system_prompt": "You are a helpful AI assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "inter_message_sep": "<|end|>\n",
+      "stop_token_ids": [
+        32000,
+        32007
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|end|>"
+      ]
+    }
   }
 ]
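
Because the phi-3-mini entries above, like the Yi-1.5 and Qwen1.5-110B-Chat ones, are added to the built-in ModelScope registry, they should show up when enumerating LLM registrations. A rough sketch, assuming a running endpoint and that each registration entry exposes a model_name field:

    from xinference.client import RESTfulClient

    # Placeholder endpoint; adjust to wherever the supervisor is running.
    client = RESTfulClient("http://127.0.0.1:9997")

    registrations = client.list_model_registrations(model_type="LLM")
    names = {entry["model_name"] for entry in registrations}

    # Spot-check a few of the families added in this release.
    for expected in ("Yi-1.5", "Yi-1.5-chat", "phi-3-mini-128k-instruct", "phi-3-mini-4k-instruct"):
        print(expected, "registered" if expected in names else "missing")
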
xinference/model/llm/pytorch/chatglm.py

@@ -147,14 +147,26 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             )
         else:
             stream = generate_config.get("stream", False)
+            stream_options = generate_config.pop("stream_options", None)
+            include_usage = (
+                stream_options["include_usage"]
+                if isinstance(stream_options, dict)
+                else False
+            )
             if stream:

                 def _stream_generator():
                     last_chunk_text_length = 0
                     chunk_id = "chat-" + str(uuid.uuid1())
+                    prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                    inputs = self._tokenizer([prompt], return_tensors="pt")
+                    inputs = inputs.to(self._model.device)
+                    prompt_tokens = len(inputs["input_ids"][0])
                     for chunk_text, _ in self._model.stream_chat(
                         self._tokenizer, prompt, chat_history, **kwargs
                     ):
+                        completion_tokens = completion_tokens + 1
+                        total_tokens = prompt_tokens + completion_tokens
                         chunk_text = chunk_text[last_chunk_text_length:]
                         last_chunk_text_length += len(chunk_text)
                         completion_choice = CompletionChoice(
@@ -166,7 +178,43 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                             created=int(time.time()),
                             model=self.model_uid,
                             choices=[completion_choice],
+                            usage=CompletionUsage(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=total_tokens,
+                            ),
+                        )
+                    completion_choice = CompletionChoice(
+                        text="", index=0, logprobs=None, finish_reason="stop"
+                    )
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+                    completion_usage = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    chunk["usage"] = completion_usage
+                    yield chunk
+                    if include_usage:
+                        chunk = CompletionChunk(
+                            id=chunk_id,
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[],
+                        )
+                        chunk["usage"] = CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
                         )
+                        yield chunk

                 return self._to_chat_completion_chunks(_stream_generator())
             else:
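
The chatglm change above wires OpenAI-style stream_options through generate_config: every streamed chunk now carries a running usage field, and when include_usage is set, a final chunk with empty choices and the total token counts is emitted at the end of the stream. A sketch of how a client might consume this; the endpoint and model UID are placeholders:

    from xinference.client import RESTfulClient

    # Assumes a ChatGLM3 model has already been launched; the UID is a placeholder.
    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("chatglm3")

    for chunk in model.chat(
        "Tell me a one-sentence joke.",
        generate_config={
            "stream": True,
            "stream_options": {"include_usage": True},
        },
    ):
        if chunk["choices"]:
            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="", flush=True)
        else:
            # Final usage-only chunk added by the code above.
            print("\nusage:", chunk["usage"])
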
xinference/model/llm/pytorch/core.py

@@ -143,12 +143,17 @@ class PytorchModel(LLM):
                 f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
             )

-        for peft_model in self._peft_model:
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                peft_model.local_path,
-            )
+        for i, peft_model in enumerate(self._peft_model):
+            if i == 0:
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                    adapter_name=peft_model.lora_name,
+                )
+            else:
+                self._model.load_adapter(
+                    peft_model.local_path, adapter_name=peft_model.lora_name
+                )
             logger.info(
                 f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
             )
@@ -302,6 +307,18 @@ class PytorchModel(LLM):
         assert self._model is not None
         assert self._tokenizer is not None

+        lora_model = generate_config.pop("lora_name")
+
+        if lora_model is not None and self._peft_model is not None:
+            for lora in self._peft_model:
+                if lora_model == lora.lora_name:
+                    self._model.set_adapter(lora_model)
+                    logger.info(f"Set lora model to {lora_model}")
+                    break
+            else:
+                self._model.disable_adapter()
+                logger.info(f"No lora model {lora_model} found, skip setting")
+
         stream = generate_config.get("stream", False)
         if not stream:
             if "falcon" in model_family_name:
xinference/model/llm/pytorch/deepseek_vl.py

@@ -27,9 +27,11 @@ import torch
 from ....model.utils import select_device
 from ....types import (
     ChatCompletion,
-    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    Completion,
+    CompletionChoice,
+    CompletionChunk,
     CompletionUsage,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -67,12 +69,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
         self._type = torch.float16 if self._device == "mps" else torch.bfloat16

         # specify the path to the model
-        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
+        self._vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(  # type: ignore
             self.model_path
         )
         self._tokenizer = self._vl_chat_processor.tokenizer

-        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(  # type: ignore
             self.model_path, trust_remote_code=True, device_map=self._device
         )
         self._model = vl_gpt.to(self._type).eval()
@@ -149,10 +151,16 @@ class DeepSeekVLChatModel(PytorchChatModel):
         chat_history: Optional[List[ChatCompletionMessage]] = None,
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        if generate_config and generate_config.get("stream"):
-            raise Exception(
-                f"Chat with model {self.model_family.model_name} does not support stream."
-            )
+        if not generate_config:
+            generate_config = {}
+
+        stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -184,6 +192,7 @@ class DeepSeekVLChatModel(PytorchChatModel):

         deepseek_history.extend(prompt_messages)

+        from ....thirdparty.deepseek_vl.serve.inference import generate
         from ....thirdparty.deepseek_vl.utils.io import load_pil_images

         # load images and prepare for inputs
@@ -192,41 +201,114 @@ class DeepSeekVLChatModel(PytorchChatModel):
             conversations=deepseek_history, images=pil_images, force_batchify=True
         ).to(self._model.device, self._model.dtype)

-        # run image encoder to get the image embeddings
-        inputs_embeds = self._model.prepare_inputs_embeds(**prepare_inputs)
-
-        # run the model to get the response
-        outputs = self._model.language_model.generate(
-            inputs_embeds=inputs_embeds,
-            attention_mask=prepare_inputs.attention_mask,
-            pad_token_id=self._tokenizer.eos_token_id,
-            bos_token_id=self._tokenizer.bos_token_id,
-            eos_token_id=self._tokenizer.eos_token_id,
-            max_new_tokens=512,
-            do_sample=True,
-            top_p=0.95,
-            temperature=0.2,
-            repetition_penalty=1.1,
-            use_cache=True,
-        )
+        temperature = generate_config.get("temperature", 0.2)
+        top_p = generate_config.get("top_p", 0.95)
+        max_new_tokens = generate_config.get("max_tokens", 512)
+        repetition_penalty = generate_config.get("repetition_penalty", 1.1)
+
+        conversation = self._vl_chat_processor.new_chat_template()
+        stop_str = conversation.sep2
+        stop_words = [stop_str]

-        answer = self._tokenizer.decode(
-            outputs[0].cpu().tolist(), skip_special_tokens=True
+        streamer = generate(
+            vl_gpt=self._model,
+            tokenizer=self._tokenizer,
+            prepare_inputs=prepare_inputs,
+            max_gen_len=max_new_tokens,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            top_p=top_p,
+            stop_words=stop_words,
         )

-        return ChatCompletion(
-            id="chat" + str(uuid.uuid1()),
-            object="chat.completion",
+        if stream:
+            it = self._generate_stream(streamer, stop_str, include_usage, prompt)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self._generate(streamer, stop_str)
+            return self._to_chat_completion(c)
+
+    def _generate(self, streamer, stop_str) -> Completion:
+        generated_text = ""
+        for new_text in streamer:
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            generated_text += new_text
+
+        c = Completion(
+            id=str(uuid.uuid1()),
+            object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message={"role": "assistant", "content": answer},
-                    finish_reason="stop",
+                CompletionChoice(
+                    index=0, text=generated_text, finish_reason="stop", logprobs=None
                 )
             ],
             usage=CompletionUsage(
                 prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
             ),
         )
+        return c
+
+    def _generate_stream(
+        self, streamer, stop_str, include_usage, prompt
+    ) -> Iterator[CompletionChunk]:
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt).input_ids
+        prompt_tokens = len(input_ids)
+        for i, new_text in enumerate(streamer):
+            if new_text.endswith(stop_str):
+                new_text = new_text[: -len(stop_str)]
+            completion_choice = CompletionChoice(
+                text=new_text, index=0, logprobs=None, finish_reason=None
+            )
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[completion_choice],
+            )
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+            completion_usage = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            chunk["usage"] = completion_usage
+            yield chunk
+
+        completion_choice = CompletionChoice(
+            text="", index=0, logprobs=None, finish_reason="stop"
+        )
+        chunk = CompletionChunk(
+            id=completion_id,
+            object="text_completion",
+            created=int(time.time()),
+            model=self.model_uid,
+            choices=[completion_choice],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+        )
+        chunk["usage"] = completion_usage
+        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
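
With the rewrite above, deepseek-vl chat no longer rejects stream=True; it drives the third-party generate streamer and yields completion chunks the same way the text-only models do. A sketch of streaming a vision-language chat through the RESTful client; the endpoint, model UID, and image URL are placeholders:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("deepseek-vl-chat")

    # OpenAI-style multimodal content; _message_content_to_deepseek splits it
    # into text and images on the server side.
    content = [
        {"type": "text", "text": "Describe this image in one sentence."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ]

    for chunk in model.chat(prompt=content, generate_config={"stream": True, "max_tokens": 256}):
        if chunk["choices"]:
            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="", flush=True)
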