xinference-0.11.0-py3-none-any.whl → xinference-0.11.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (37)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +10 -4
  3. xinference/core/model.py +2 -2
  4. xinference/fields.py +3 -1
  5. xinference/model/llm/ggml/chatglm.py +98 -13
  6. xinference/model/llm/ggml/llamacpp.py +49 -2
  7. xinference/model/llm/llm_family.json +132 -3
  8. xinference/model/llm/llm_family_modelscope.json +139 -3
  9. xinference/model/llm/pytorch/chatglm.py +48 -0
  10. xinference/model/llm/pytorch/core.py +23 -6
  11. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  12. xinference/model/llm/pytorch/internlm2.py +32 -1
  13. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  14. xinference/model/llm/pytorch/utils.py +38 -1
  15. xinference/model/llm/pytorch/yi_vl.py +42 -14
  16. xinference/model/llm/sglang/core.py +31 -9
  17. xinference/model/llm/utils.py +25 -5
  18. xinference/model/llm/vllm/core.py +82 -3
  19. xinference/types.py +10 -1
  20. xinference/web/ui/build/asset-manifest.json +3 -3
  21. xinference/web/ui/build/index.html +1 -1
  22. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  23. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  27. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
  28. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
  29. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  30. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  32. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  33. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  34. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  35. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  36. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  37. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family_modelscope.json
@@ -1289,7 +1289,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 262144,
     "model_name": "Yi-200k",
     "model_lang": [
       "en",
@@ -1328,7 +1328,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "Yi-chat",
     "model_lang": [
       "en",
@@ -1349,6 +1349,18 @@
         "model_id": "01ai/Yi-34B-Chat-{quantization}",
         "model_revision": "master"
       },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-6B-Chat",
+        "model_revision": "master"
+      },
       {
         "model_format": "pytorch",
         "model_size_in_billions": 34,
@@ -1385,6 +1397,130 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "Yi-1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 6,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-6B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 9,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-9B-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 34,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "01ai/Yi-1.5-34B-Chat",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "CHATML",
+      "system_prompt": "",
+      "roles": [
+        "<|im_start|>user",
+        "<|im_start|>assistant"
+      ],
+      "intra_message_sep": "<|im_end|>",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2,
+        6,
+        7,
+        8
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+        "<|im_sep|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
@@ -2755,7 +2891,7 @@
   },
   {
     "version": 1,
-    "context_length": 204800,
+    "context_length": 4096,
     "model_name": "yi-vl-chat",
     "model_lang": [
       "en",
xinference/model/llm/pytorch/chatglm.py
@@ -147,14 +147,26 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             )
         else:
             stream = generate_config.get("stream", False)
+            stream_options = generate_config.pop("stream_options", None)
+            include_usage = (
+                stream_options["include_usage"]
+                if isinstance(stream_options, dict)
+                else False
+            )
             if stream:
 
                 def _stream_generator():
                     last_chunk_text_length = 0
                     chunk_id = "chat-" + str(uuid.uuid1())
+                    prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                    inputs = self._tokenizer([prompt], return_tensors="pt")
+                    inputs = inputs.to(self._model.device)
+                    prompt_tokens = len(inputs["input_ids"][0])
                     for chunk_text, _ in self._model.stream_chat(
                         self._tokenizer, prompt, chat_history, **kwargs
                     ):
+                        completion_tokens = completion_tokens + 1
+                        total_tokens = prompt_tokens + completion_tokens
                         chunk_text = chunk_text[last_chunk_text_length:]
                         last_chunk_text_length += len(chunk_text)
                         completion_choice = CompletionChoice(
@@ -166,7 +178,43 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                             created=int(time.time()),
                             model=self.model_uid,
                             choices=[completion_choice],
+                            usage=CompletionUsage(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=total_tokens,
+                            ),
+                        )
+                    completion_choice = CompletionChoice(
+                        text="", index=0, logprobs=None, finish_reason="stop"
+                    )
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+                    completion_usage = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    chunk["usage"] = completion_usage
+                    yield chunk
+                    if include_usage:
+                        chunk = CompletionChunk(
+                            id=chunk_id,
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[],
+                        )
+                        chunk["usage"] = CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
                         )
+                        yield chunk
 
                 return self._to_chat_completion_chunks(_stream_generator())
             else:
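The chatglm changes above, and the matching edits in the other PyTorch chat models below, pop a `stream_options` mapping out of `generate_config`, count prompt and completion tokens while streaming, and, when `include_usage` is set, yield one final chunk with empty `choices` that carries the usage totals. A hedged sketch of how a caller might opt in through the OpenAI-compatible endpoint; the base URL, API key and model uid are placeholders, and it assumes a recent openai-python release plus a server that accepts `stream_options`.

# Hedged sketch of a client requesting the trailing usage chunk.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-used")
stream = client.chat.completions.create(
    model="chatglm3",  # hypothetical model uid
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage:  # final chunk: empty choices, usage totals attached
        print("\n", chunk.usage)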
xinference/model/llm/pytorch/core.py
@@ -143,12 +143,17 @@ class PytorchModel(LLM):
                 f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
             )
 
-        for peft_model in self._peft_model:
-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                peft_model.local_path,
-            )
+        for i, peft_model in enumerate(self._peft_model):
+            if i == 0:
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                    adapter_name=peft_model.lora_name,
+                )
+            else:
+                self._model.load_adapter(
+                    peft_model.local_path, adapter_name=peft_model.lora_name
+                )
             logger.info(
                 f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
             )
@@ -302,6 +307,18 @@ class PytorchModel(LLM):
         assert self._model is not None
         assert self._tokenizer is not None
 
+        lora_model = generate_config.pop("lora_name")
+
+        if lora_model is not None and self._peft_model is not None:
+            for lora in self._peft_model:
+                if lora_model == lora.lora_name:
+                    self._model.set_adapter(lora_model)
+                    logger.info(f"Set lora model to {lora_model}")
+                    break
+            else:
+                self._model.disable_adapter()
+                logger.info(f"No lora model {lora_model} found, skip setting")
+
         stream = generate_config.get("stream", False)
         if not stream:
             if "falcon" in model_family_name:
xinference/model/llm/pytorch/deepseek_vl.py
@@ -155,7 +155,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
-
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         prompt, images = self._message_content_to_deepseek(prompt)
         prompt_messages: List[Dict[str, Any]] = [
             {
@@ -217,7 +222,7 @@ class DeepSeekVLChatModel(PytorchChatModel):
         )
 
         if stream:
-            it = self._generate_stream(streamer, stop_str)
+            it = self._generate_stream(streamer, stop_str, include_usage, prompt)
             return self._to_chat_completion_chunks(it)
         else:
             c = self._generate(streamer, stop_str)
@@ -246,8 +251,13 @@ class DeepSeekVLChatModel(PytorchChatModel):
         )
         return c
 
-    def _generate_stream(self, streamer, stop_str) -> Iterator[CompletionChunk]:
+    def _generate_stream(
+        self, streamer, stop_str, include_usage, prompt
+    ) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt).input_ids
+        prompt_tokens = len(input_ids)
         for i, new_text in enumerate(streamer):
             if new_text.endswith(stop_str):
                 new_text = new_text[: -len(stop_str)]
@@ -261,10 +271,12 @@ class DeepSeekVLChatModel(PytorchChatModel):
                 model=self.model_uid,
                 choices=[completion_choice],
             )
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
             completion_usage = CompletionUsage(
-                prompt_tokens=-1,
-                completion_tokens=-1,
-                total_tokens=-1,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
             )
             chunk["usage"] = completion_usage
             yield chunk
@@ -280,9 +292,23 @@ class DeepSeekVLChatModel(PytorchChatModel):
             choices=[completion_choice],
         )
         completion_usage = CompletionUsage(
-            prompt_tokens=-1,
-            completion_tokens=-1,
-            total_tokens=-1,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
         )
         chunk["usage"] = completion_usage
         yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
xinference/model/llm/pytorch/internlm2.py
@@ -108,6 +108,12 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             kwargs["max_length"] = int(max_new_tokens)
 
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         if chat_history:
             input_history = [
                 (chat_history[i]["content"], (chat_history[i + 1]["content"]))
@@ -122,9 +128,15 @@ class Internlm2PytorchChatModel(PytorchChatModel):
            def _stream_generator():
                last_chunk_text_length = 0
                chunk_id = "chat-" + str(uuid.uuid1())
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+                inputs = self._tokenizer([prompt], return_tensors="pt")
+                inputs = inputs.to(self._model.device)
+                prompt_tokens = len(inputs["input_ids"][0])
                for chunk_text, _ in self._model.stream_chat(
-                    self._tokenizer, prompt, input_history, **kwargs
+                    self._tokenizer, prompt, chat_history, **kwargs
                ):
+                    completion_tokens = completion_tokens + 1
+                    total_tokens = prompt_tokens + completion_tokens
                    chunk_text = chunk_text[last_chunk_text_length:]
                    last_chunk_text_length += len(chunk_text)
                    completion_choice = CompletionChoice(
@@ -136,7 +148,26 @@ class Internlm2PytorchChatModel(PytorchChatModel):
                        created=int(time.time()),
                        model=self.model_uid,
                        choices=[completion_choice],
+                        usage=CompletionUsage(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=total_tokens,
+                        ),
+                    )
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=chunk_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                    )
+                    yield chunk
 
            return self._to_chat_completion_chunks(_stream_generator())
        else:
xinference/model/llm/pytorch/qwen_vl.py
@@ -134,9 +134,16 @@ class QwenVLChatModel(PytorchChatModel):
             query_to_response = []
 
         stream = generate_config.get("stream", False) if generate_config else False
-
+        stream_options = (
+            generate_config.pop("stream_options", None) if generate_config else None
+        )
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         if stream:
-            it = self._generate_stream(prompt, qwen_history)
+            it = self._generate_stream(prompt, qwen_history, include_usage)
             return self._to_chat_completion_chunks(it)
         else:
             c = self._generate(prompt, qwen_history)
@@ -163,12 +170,16 @@ class QwenVLChatModel(PytorchChatModel):
         return c
 
     def _generate_stream(
-        self, prompt: str, qwen_history: List
+        self, prompt: str, qwen_history: List, include_usage
     ) -> Iterator[CompletionChunk]:
         # response, history = model.chat(tokenizer, message, history=history)
         response_generator = self._model.chat_stream(
             self._tokenizer, query=prompt, history=qwen_history
         )
+        completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        input_ids = self._tokenizer(prompt, allowed_special="all").input_ids
+        prompt_tokens = len(input_ids)
         full_response = ""
         for response in response_generator:
             inc_content = response[len(full_response) :]
@@ -177,16 +188,18 @@ class QwenVLChatModel(PytorchChatModel):
                 text=inc_content, index=0, logprobs=None, finish_reason=None
             )
             completion_chunk = CompletionChunk(
-                id=str(uuid.uuid1()),
+                id=completion_id,
                 object="text_completion",
                 created=int(time.time()),
                 model=self.model_uid,
                 choices=[completion_choice],
             )
+            completion_tokens = completion_tokens + 1
+            total_tokens = prompt_tokens + completion_tokens
             completion_usage = CompletionUsage(
-                prompt_tokens=-1,
-                completion_tokens=-1,
-                total_tokens=-1,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
             )
             completion_chunk["usage"] = completion_usage
             yield completion_chunk
@@ -195,16 +208,30 @@ class QwenVLChatModel(PytorchChatModel):
             text="", index=0, logprobs=None, finish_reason="stop"
         )
         completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
+            id=completion_id,
             object="text_completion",
             created=int(time.time()),
             model=self.model_uid,
             choices=[completion_choice],
         )
         completion_usage = CompletionUsage(
-            prompt_tokens=-1,
-            completion_tokens=-1,
-            total_tokens=-1,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
         )
         completion_chunk["usage"] = completion_usage
         yield completion_chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
xinference/model/llm/pytorch/utils.py
@@ -106,6 +106,10 @@ def generate_stream(
     context_len = get_context_length(model.config)
     stream_interval = generate_config.get("stream_interval", 2)
     stream = generate_config.get("stream", False)
+    stream_options = generate_config.pop("stream_options", None)
+    include_usage = (
+        stream_options["include_usage"] if isinstance(stream_options, dict) else False
+    )
 
     len_prompt = len(prompt)
 
@@ -333,6 +337,21 @@ def generate_stream(
 
     yield completion_chunk, completion_usage
 
+    if include_usage:
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=model_uid,
+            choices=[],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=input_echo_len,
+            completion_tokens=i,
+            total_tokens=(input_echo_len + i),
+        )
+        yield completion_chunk, completion_usage
+
     # clean
     del past_key_values, out
     gc.collect()
@@ -352,7 +371,10 @@ def generate_stream_falcon(
     context_len = get_context_length(model.config)
     stream_interval = generate_config.get("stream_interval", 2)
     stream = generate_config.get("stream", False)
-
+    stream_options = generate_config.pop("stream_options", None)
+    include_usage = (
+        stream_options["include_usage"] if isinstance(stream_options, dict) else False
+    )
     len_prompt = len(prompt)
 
     temperature = float(generate_config.get("temperature", 1.0))
@@ -488,6 +510,21 @@ def generate_stream_falcon(
 
     yield completion_chunk, completion_usage
 
+    if include_usage:
+        completion_chunk = CompletionChunk(
+            id=str(uuid.uuid1()),
+            object="text_completion",
+            created=int(time.time()),
+            model=model_uid,
+            choices=[],
+        )
+        completion_usage = CompletionUsage(
+            prompt_tokens=input_echo_len,
+            completion_tokens=i,
+            total_tokens=(input_echo_len + i),
+        )
+        yield completion_chunk, completion_usage
+
     # clean
     gc.collect()
     empty_cache()