xinference 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (37)
  1. xinference/_version.py +3 -3
  2. xinference/core/chat_interface.py +10 -4
  3. xinference/core/model.py +2 -2
  4. xinference/fields.py +3 -1
  5. xinference/model/llm/ggml/chatglm.py +98 -13
  6. xinference/model/llm/ggml/llamacpp.py +49 -2
  7. xinference/model/llm/llm_family.json +132 -3
  8. xinference/model/llm/llm_family_modelscope.json +139 -3
  9. xinference/model/llm/pytorch/chatglm.py +48 -0
  10. xinference/model/llm/pytorch/core.py +23 -6
  11. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  12. xinference/model/llm/pytorch/internlm2.py +32 -1
  13. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  14. xinference/model/llm/pytorch/utils.py +38 -1
  15. xinference/model/llm/pytorch/yi_vl.py +42 -14
  16. xinference/model/llm/sglang/core.py +31 -9
  17. xinference/model/llm/utils.py +25 -5
  18. xinference/model/llm/vllm/core.py +82 -3
  19. xinference/types.py +10 -1
  20. xinference/web/ui/build/asset-manifest.json +3 -3
  21. xinference/web/ui/build/index.html +1 -1
  22. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  23. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  24. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  27. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
  28. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
  29. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  30. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  32. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  33. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  34. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  35. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  36. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  37. {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
  version_json = '''
  {
- "date": "2024-05-11T17:30:18+0800",
+ "date": "2024-05-17T14:10:09+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "21be5abd6ff8411015a9b8862cbdb6b070bc2b1c",
- "version": "0.11.0"
+ "full-revisionid": "55a0200079eacf4fd6ee10c5868f0eaba244db29",
+ "version": "0.11.1"
  }
  ''' # END VERSION_JSON
 
xinference/core/chat_interface.py CHANGED
@@ -109,6 +109,7 @@ class GradioInterface:
  history: List[List[str]],
  max_tokens: int,
  temperature: float,
+ lora_name: str,
  ) -> Generator:
  from ..client import RESTfulClient
 
@@ -127,6 +128,7 @@
  "max_tokens": int(max_tokens),
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -152,6 +154,7 @@
  gr.Slider(
  minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
  ),
+ gr.Text(label="LoRA Name"),
  ],
  title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
  css="""
@@ -331,7 +334,7 @@
  history: hist,
  }
 
- def complete(text, hist, max_tokens, temperature) -> Generator:
+ def complete(text, hist, max_tokens, temperature, lora_name) -> Generator:
  from ..client import RESTfulClient
 
  client = RESTfulClient(self.endpoint)
@@ -349,6 +352,7 @@
  "max_tokens": max_tokens,
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -368,7 +372,7 @@
  history: hist,
  }
 
- def retry(text, hist, max_tokens, temperature) -> Generator:
+ def retry(text, hist, max_tokens, temperature, lora_name) -> Generator:
  from ..client import RESTfulClient
 
  client = RESTfulClient(self.endpoint)
@@ -387,6 +391,7 @@
  "max_tokens": max_tokens,
  "temperature": temperature,
  "stream": True,
+ "lora_name": lora_name,
  },
  ):
  assert isinstance(chunk, dict)
@@ -470,10 +475,11 @@
  temperature = gr.Slider(
  minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
  )
+ lora_name = gr.Text(label="LoRA Name")
 
  btn_generate.click(
  fn=complete,
- inputs=[textbox, history, length, temperature],
+ inputs=[textbox, history, length, temperature, lora_name],
  outputs=[textbox, history],
  )
 
@@ -485,7 +491,7 @@
 
  btn_retry.click(
  fn=retry,
- inputs=[textbox, history, length, temperature],
+ inputs=[textbox, history, length, temperature, lora_name],
  outputs=[textbox, history],
  )
 
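Note (not part of the diff): the new "LoRA Name" textbox is simply forwarded as `lora_name` inside `generate_config`. A minimal sketch of passing the same option through the REST client, assuming a model already launched on a local endpoint; the model UID and adapter name are placeholders:

    from xinference.client import RESTfulClient

    # Placeholder endpoint and model UID; adjust to your deployment.
    client = RESTfulClient("http://localhost:9997")
    model = client.get_model("my-model-uid")

    # lora_name rides along in generate_config, exactly as the Gradio UI now does.
    for chunk in model.chat(
        "Hello",
        generate_config={
            "max_tokens": 256,
            "temperature": 0.7,
            "stream": True,
            "lora_name": "my-lora",  # hypothetical adapter registered at launch time
        },
    ):
        assert isinstance(chunk, dict)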
xinference/core/model.py CHANGED
@@ -257,7 +257,7 @@ class ModelActor(xo.StatelessActor):
  for v in gen:
  if time_to_first_token is None:
  time_to_first_token = (time.time() - start_time) * 1000
- final_usage = v.pop("usage", None)
+ final_usage = v.get("usage", None)
  v = dict(data=json.dumps(v))
  yield sse_starlette.sse.ensure_bytes(v, None)
  except OutOfMemoryError:
@@ -289,7 +289,7 @@ class ModelActor(xo.StatelessActor):
  async for v in gen:
  if time_to_first_token is None:
  time_to_first_token = (time.time() - start_time) * 1000
- final_usage = v.pop("usage", None)
+ final_usage = v.get("usage", None)
  v = await asyncio.to_thread(json.dumps, v)
  v = dict(data=v) # noqa: F821
  yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
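Note (not part of the diff): the `pop` to `get` change matters because `dict.pop` stripped the "usage" key before the chunk was JSON-serialized for SSE, so streaming clients never saw the usage payload; `dict.get` records `final_usage` without mutating the chunk. A minimal illustration in plain Python:

    import json

    chunk = {"id": "cmpl-1", "choices": [], "usage": {"total_tokens": 3}}

    # Old behaviour: pop() removes "usage", so the serialized SSE payload loses it.
    popped = dict(chunk)
    popped.pop("usage", None)
    assert "usage" not in json.loads(json.dumps(popped))

    # New behaviour: get() reads it while leaving the chunk intact.
    final_usage = chunk.get("usage", None)
    assert final_usage is not None and "usage" in json.loads(json.dumps(chunk))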
xinference/fields.py CHANGED
@@ -75,7 +75,9 @@ stream_field = Field(
  )
 
  stream_option_field = Field(
- default={},
+ default={
+ "include_usage": False,
+ },
  description="If set, an additional chunk will be streamed before the `data: [DONE]` message.",
  )
 
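Note (not part of the diff): `stream_options` follows the OpenAI convention, so the extra usage chunk can be requested through xinference's OpenAI-compatible endpoint. A sketch assuming a recent `openai` Python client (1.26 or later) and a locally served model; the endpoint and model UID are placeholders:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:9997/v1", api_key="not-needed")

    stream = client.chat.completions.create(
        model="my-model-uid",  # placeholder model UID
        messages=[{"role": "user", "content": "Hi"}],
        stream=True,
        stream_options={"include_usage": True},  # ask for the extra usage chunk
    )
    for chunk in stream:
        if chunk.usage is not None:  # the chunk streamed before [DONE] carries usage
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens)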
xinference/model/llm/ggml/chatglm.py CHANGED
@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):
 
  @staticmethod
  def _convert_raw_text_chunks_to_chat(
- tokens: Iterator[Any], model_name: str
+ tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
  ) -> Iterator[ChatCompletionChunk]:
+ request_id = str(uuid.uuid4())
  yield {
- "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+ "id": "chat" + f"cmpl-{request_id}",
  "model": model_name,
  "object": "chat.completion.chunk",
  "created": int(time.time()),
@@ -125,9 +126,13 @@
  }
  ],
  }
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
  for token in tokens:
+ prompt_tokens = len(input_ids)
+ completion_tokens = completion_tokens + 1
+ total_tokens = prompt_tokens + completion_tokens
  yield {
- "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+ "id": "chat" + f"cmpl-{request_id}",
  "model": model_name,
  "object": "chat.completion.chunk",
  "created": int(time.time()),
@@ -143,6 +148,35 @@
  }
  ],
  }
+ # stop
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "chat.completion.chunk",
+ "created": int(time.time()),
+ "choices": [
+ {
+ "index": 0,
+ "delta": {
+ "content": "",
+ },
+ "finish_reason": "stop",
+ }
+ ],
+ }
+ if include_usage:
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "chat.completion.chunk",
+ "created": int(time.time()),
+ "choices": [],
+ "usage": {
+ "prompt_tokens": prompt_tokens,
+ "completion_tokens": completion_tokens,
+ "total_tokens": total_tokens,
+ },
+ }
 
  @classmethod
  def _convert_raw_text_completion_to_chat(
@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):
 
  params = {
  "max_length": generate_config.get("max_tokens"),
- "max_context_length": generate_config.get("max_tokens"),
+ "max_context_length": generate_config.get("max_tokens", 1024),
  "top_k": generate_config.get("top_k"),
  "top_p": generate_config.get("top_p"),
  "temperature": generate_config.get("temperature"),
@@ -286,13 +320,27 @@
  assert self._llm is not None
  chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
- if generate_config["stream"]:
+ stream = generate_config.get("stream")
+ stream_options = generate_config.get("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+
+ if stream:
  it = self._llm.chat(
  chat_history_messages,
  **params,
  )
  assert not isinstance(it, str)
- return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
+ input_ids = self._llm.tokenizer.encode_messages(
+ chat_history_messages, params["max_context_length"]
+ )
+ return self._convert_raw_text_chunks_to_chat(
+ it, self.model_uid, include_usage, input_ids
+ )
+
  else:
  c = self._llm.chat(
  chat_history_messages,
@@ -320,11 +368,13 @@
 
  @staticmethod
  def _convert_str_to_completion_chunk(
- tokens: Iterator[str], model_name: str
+ tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
  ) -> Iterator[CompletionChunk]:
- for token in tokens:
+ request_id = str(uuid.uuid4())
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+ for i, token in enumerate(tokens):
  yield {
- "id": "generate" + f"-{str(uuid.uuid4())}",
+ "id": "generate" + f"-{request_id}",
  "model": model_name,
  "object": "text_completion",
  "created": int(time.time()),
@@ -332,6 +382,32 @@
  {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
  ],
  }
+ prompt_tokens = len(input_ids)
+ completion_tokens = i
+ total_tokens = prompt_tokens + completion_tokens
+ # stop
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "text_completion",
+ "created": int(time.time()),
+ "choices": [
+ {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+ ],
+ }
+ if include_usage:
+ yield {
+ "id": "chat" + f"cmpl-{request_id}",
+ "model": model_name,
+ "object": "text_completion",
+ "created": int(time.time()),
+ "choices": [],
+ "usage": {
+ "prompt_tokens": prompt_tokens,
+ "completion_tokens": completion_tokens,
+ "total_tokens": total_tokens,
+ },
+ }
 
  def generate(
  self,
@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):
 
  params = {
  "max_length": generate_config.get("max_tokens"),
- "max_context_length": generate_config.get("max_tokens"),
+ "max_context_length": generate_config.get("max_tokens", 1024),
  "top_k": generate_config.get("top_k"),
  "top_p": generate_config.get("top_p"),
  "temperature": generate_config.get("temperature"),
@@ -355,14 +431,23 @@
  params = {k: v for k, v in params.items() if v is not None}
 
  assert self._llm is not None
-
- if generate_config["stream"]:
+ stream = generate_config.get("stream")
+ stream_options = generate_config.get("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+ if stream:
  it = self._llm.generate(
  prompt,
  **params,
  )
  assert not isinstance(it, str)
- return self._convert_str_to_completion_chunk(it, self.model_uid)
+ input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+ return self._convert_str_to_completion_chunk(
+ it, self.model_uid, include_usage, input_ids
+ )
  else:
  c = self._llm.generate(
  prompt,
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -14,6 +14,7 @@
  import datetime
  import logging
  import os
+ import time
  from typing import Iterable, Iterator, List, Optional, Union
 
  from ....types import (
@@ -22,6 +23,7 @@ from ....types import (
  ChatCompletionMessage,
  Completion,
  CompletionChunk,
+ CompletionUsage,
  CreateCompletionLlamaCpp,
  Embedding,
  LlamaCppGenerateConfig,
@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
  generate_config = LlamaCppGenerateConfig(
  **CreateCompletionLlamaCpp(**generate_config).dict()
  )
+ # Currently, llama.cpp does not support lora
+ generate_config.pop("lora_name", None) # type: ignore
  return generate_config
 
  def _convert_ggml_to_gguf(self, model_path: str) -> str:
@@ -195,16 +199,59 @@
  _generate_config: LlamaCppGenerateConfig,
  ) -> Iterator[CompletionChunk]:
  assert self._llm is not None
- for _completion_chunk in self._llm(prompt=_prompt, **_generate_config):
+ prompt_token_ids: List[int] = (
+ (
+ self._llm.tokenize(prompt.encode("utf-8"), special=True)
+ if prompt != ""
+ else [self._llm.token_bos()]
+ )
+ if isinstance(prompt, str)
+ else prompt
+ )
+ prompt_tokens = len(prompt_token_ids)
+ completion_tokens, total_tokens = 0, 0
+ request_id = 0
+ for index, _completion_chunk in enumerate(
+ self._llm(prompt=_prompt, **_generate_config)
+ ):
+ request_id = _completion_chunk["id"]
+ choice = _completion_chunk["choices"][0]
+ if choice["finish_reason"] is not None:
+ completion_tokens = index
+ total_tokens = prompt_tokens + completion_tokens
+ _completion_chunk["usage"] = CompletionUsage(
+ prompt_tokens=total_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
  yield _completion_chunk
+ if include_usage:
+ chunk = CompletionChunk(
+ id=request_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield chunk
 
  logger.debug(
  "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
  )
 
  generate_config = self._sanitize_generate_config(generate_config)
-
  stream = generate_config.get("stream", False)
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
 
  if not stream:
  assert self._llm is not None
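Note (not part of the diff): `prompt_tokens` is obtained by tokenizing the prompt with llama-cpp-python itself, the same call the new code uses. A minimal sketch, assuming llama-cpp-python is installed; the GGUF path is a placeholder:

    from llama_cpp import Llama

    # vocab_only loads only the tokenizer/vocabulary; the path is a placeholder.
    llm = Llama(model_path="/path/to/model.gguf", vocab_only=True)

    prompt = "Hello, world"
    prompt_token_ids = (
        llm.tokenize(prompt.encode("utf-8"), special=True)
        if prompt != ""
        else [llm.token_bos()]
    )
    print(len(prompt_token_ids))  # becomes prompt_tokens in the usage chunk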
xinference/model/llm/llm_family.json CHANGED
@@ -3651,7 +3651,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 262144,
  "model_name": "Yi-200k",
  "model_lang": [
  "en",
@@ -3688,7 +3688,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 4096,
  "model_name": "Yi-chat",
  "model_lang": [
  "en",
@@ -3707,6 +3707,17 @@
  ],
  "model_id": "01-ai/Yi-34B-Chat-{quantization}"
  },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-6B-Chat",
+ "model_revision": "1c20c960895e4c3877cf478bc2df074221b81d7b"
+ },
  {
  "model_format": "pytorch",
  "model_size_in_billions": 34,
@@ -3762,6 +3773,124 @@
  ]
  }
  },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "generate"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B",
+ "model_revision": "741a657c42d2081f777ce4c6c5572090f8b8c886"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B",
+ "model_revision": "9a6839c5b9db3dbb245fb98a072bfabc242621f2"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B",
+ "model_revision": "4f83007957ec3eec76d87df19ad061eb0f57b5c5"
+ }
+ ]
+ },
+ {
+ "version": 1,
+ "context_length": 4096,
+ "model_name": "Yi-1.5-chat",
+ "model_lang": [
+ "en",
+ "zh"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "Yi-1.5 is an upgraded version of Yi. It is continuously pre-trained on Yi with a high-quality corpus of 500B tokens and fine-tuned on 3M diverse fine-tuning samples.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 6,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-6B-Chat",
+ "model_revision": "d68dab90947a3c869e28c9cb2806996af99a6080"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 9,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-9B-Chat",
+ "model_revision": "1dc6e2b8dcfc12b95bede8dec67e6b6332ac64c6"
+ },
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 34,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "01-ai/Yi-1.5-34B-Chat",
+ "model_revision": "fa695ee438bfcd0ec2b378fa1c7e0dea1b40393e"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "CHATML",
+ "system_prompt": "",
+ "roles": [
+ "<|im_start|>user",
+ "<|im_start|>assistant"
+ ],
+ "intra_message_sep": "<|im_end|>",
+ "inter_message_sep": "",
+ "stop_token_ids": [
+ 2,
+ 6,
+ 7,
+ 8
+ ],
+ "stop": [
+ "<|endoftext|>",
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|im_sep|>"
+ ]
+ }
+ },
  {
  "version": 1,
  "context_length": 2048,
@@ -4684,7 +4813,7 @@
  },
  {
  "version": 1,
- "context_length": 204800,
+ "context_length": 4096,
  "model_name": "yi-vl-chat",
  "model_lang": [
  "en",