xinference 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. The information is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (56)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/chat_interface.py +10 -4
  6. xinference/core/model.py +2 -2
  7. xinference/core/supervisor.py +30 -2
  8. xinference/core/utils.py +12 -0
  9. xinference/core/worker.py +4 -1
  10. xinference/deploy/cmdline.py +126 -0
  11. xinference/deploy/test/test_cmdline.py +24 -0
  12. xinference/fields.py +3 -1
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/ggml/chatglm.py +98 -13
  15. xinference/model/llm/ggml/llamacpp.py +49 -2
  16. xinference/model/llm/llm_family.json +633 -9
  17. xinference/model/llm/llm_family.py +84 -10
  18. xinference/model/llm/llm_family_modelscope.json +337 -10
  19. xinference/model/llm/memory.py +332 -0
  20. xinference/model/llm/pytorch/chatglm.py +48 -0
  21. xinference/model/llm/pytorch/core.py +25 -6
  22. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  23. xinference/model/llm/pytorch/intern_vl.py +387 -0
  24. xinference/model/llm/pytorch/internlm2.py +32 -1
  25. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  26. xinference/model/llm/pytorch/utils.py +38 -1
  27. xinference/model/llm/pytorch/yi_vl.py +42 -14
  28. xinference/model/llm/sglang/core.py +31 -9
  29. xinference/model/llm/utils.py +38 -5
  30. xinference/model/llm/vllm/core.py +87 -5
  31. xinference/model/rerank/core.py +23 -1
  32. xinference/model/utils.py +17 -7
  33. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  34. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  35. xinference/thirdparty/llava/mm_utils.py +3 -2
  36. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  37. xinference/thirdparty/omnilmm/chat.py +6 -5
  38. xinference/types.py +10 -1
  39. xinference/web/ui/build/asset-manifest.json +3 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  42. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  46. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/METADATA +10 -8
  47. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/RECORD +52 -50
  48. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  49. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  52. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  53. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  54. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  55. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  56. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/ggml/chatglm.py

@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):
 
     @staticmethod
     def _convert_raw_text_chunks_to_chat(
-        tokens: Iterator[Any], model_name: str
+        tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[ChatCompletionChunk]:
+        request_id = str(uuid.uuid4())
         yield {
-            "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+            "id": "chat" + f"cmpl-{request_id}",
             "model": model_name,
             "object": "chat.completion.chunk",
             "created": int(time.time()),
@@ -125,9 +126,13 @@ class ChatglmCppChatModel(LLM):
                 }
             ],
         }
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
         for token in tokens:
+            prompt_tokens = len(input_ids)
+            completion_tokens = completion_tokens + 1
+            total_tokens = prompt_tokens + completion_tokens
             yield {
-                "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+                "id": "chat" + f"cmpl-{request_id}",
                 "model": model_name,
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
@@ -143,6 +148,35 @@ class ChatglmCppChatModel(LLM):
                     }
                 ],
             }
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "chat.completion.chunk",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }
 
     @classmethod
     def _convert_raw_text_completion_to_chat(
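For orientation, the converter above now gives every stream a fixed request id, a terminating chunk with an empty delta and finish_reason "stop", and, only when include_usage is requested, one final usage-only chunk. The shapes below are illustrative placeholders (id, model uid, timestamp, and token counts are made up), not output copied from the code:

# Illustrative placeholders only; not emitted verbatim by the code above.
stop_chunk = {
    "id": "chatcmpl-<uuid4>",            # same request_id on every chunk of one stream
    "model": "my-chatglm-model",         # hypothetical model uid
    "object": "chat.completion.chunk",
    "created": 1715000000,
    "choices": [
        {"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}
    ],
}

usage_chunk = {                          # only yielded when include_usage is True
    "id": "chatcmpl-<uuid4>",
    "model": "my-chatglm-model",
    "object": "chat.completion.chunk",
    "created": 1715000000,
    "choices": [],                       # no delta, just the token accounting
    "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
}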
@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):
 
         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -286,13 +320,27 @@ class ChatglmCppChatModel(LLM):
         assert self._llm is not None
         chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
-        if generate_config["stream"]:
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        if stream:
             it = self._llm.chat(
                 chat_history_messages,
                 **params,
             )
             assert not isinstance(it, str)
-            return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
+            input_ids = self._llm.tokenizer.encode_messages(
+                chat_history_messages, params["max_context_length"]
+            )
+            return self._convert_raw_text_chunks_to_chat(
+                it, self.model_uid, include_usage, input_ids
+            )
+
         else:
             c = self._llm.chat(
                 chat_history_messages,
@@ -320,11 +368,13 @@ class ChatglmCppChatModel(LLM):
 
     @staticmethod
     def _convert_str_to_completion_chunk(
-        tokens: Iterator[str], model_name: str
+        tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[CompletionChunk]:
-        for token in tokens:
+        request_id = str(uuid.uuid4())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        for i, token in enumerate(tokens):
             yield {
-                "id": "generate" + f"-{str(uuid.uuid4())}",
+                "id": "generate" + f"-{request_id}",
                 "model": model_name,
                 "object": "text_completion",
                 "created": int(time.time()),
@@ -332,6 +382,32 @@ class ChatglmCppChatModel(LLM):
                     {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
                 ],
             }
+            prompt_tokens = len(input_ids)
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "text_completion",
+            "created": int(time.time()),
+            "choices": [
+                {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }
 
     def generate(
         self,
@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):
 
         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -355,14 +431,23 @@ class ChatglmCppChatModel(LLM):
         params = {k: v for k, v in params.items() if v is not None}
 
         assert self._llm is not None
-
-        if generate_config["stream"]:
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+        if stream:
             it = self._llm.generate(
                 prompt,
                 **params,
             )
             assert not isinstance(it, str)
-            return self._convert_str_to_completion_chunk(it, self.model_uid)
+            input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+            return self._convert_str_to_completion_chunk(
+                it, self.model_uid, include_usage, input_ids
+            )
         else:
             c = self._llm.generate(
                 prompt,
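Taken together, the chatglm.cpp changes above wire the OpenAI-style stream_options flag through both chat and generate. A minimal client-side sketch against Xinference's OpenAI-compatible /v1/chat/completions endpoint follows; the host, port, and the model uid "chatglm3-ggml" are placeholders for a locally running server and launched model, and it assumes the REST layer forwards stream_options as this release's fields.py/types.py changes suggest:

import json

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/chat/completions",
    json={
        "model": "chatglm3-ggml",                       # placeholder model uid
        "messages": [{"role": "user", "content": "Say hello."}],
        "stream": True,
        "stream_options": {"include_usage": True},      # ask for the final usage chunk
    },
    stream=True,
)
for raw in resp.iter_lines():
    if not raw or not raw.startswith(b"data:"):
        continue
    payload = raw[len(b"data:"):].strip()
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if chunk.get("usage") and not chunk.get("choices"):
        print("usage:", chunk["usage"])                 # emitted once, after the stop chunk

If stream_options is omitted, the stream still ends with the empty stop chunk but no usage chunk is appended, since include_usage defaults to False.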
xinference/model/llm/ggml/llamacpp.py

@@ -14,6 +14,7 @@
 import datetime
 import logging
 import os
+import time
 from typing import Iterable, Iterator, List, Optional, Union
 
 from ....types import (
@@ -22,6 +23,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChunk,
+    CompletionUsage,
     CreateCompletionLlamaCpp,
     Embedding,
     LlamaCppGenerateConfig,
@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
             generate_config = LlamaCppGenerateConfig(
                 **CreateCompletionLlamaCpp(**generate_config).dict()
             )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
     def _convert_ggml_to_gguf(self, model_path: str) -> str:
@@ -195,16 +199,59 @@
             _generate_config: LlamaCppGenerateConfig,
         ) -> Iterator[CompletionChunk]:
             assert self._llm is not None
-            for _completion_chunk in self._llm(prompt=_prompt, **_generate_config):
+            prompt_token_ids: List[int] = (
+                (
+                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
+                    if prompt != ""
+                    else [self._llm.token_bos()]
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            prompt_tokens = len(prompt_token_ids)
+            completion_tokens, total_tokens = 0, 0
+            request_id = 0
+            for index, _completion_chunk in enumerate(
+                self._llm(prompt=_prompt, **_generate_config)
+            ):
+                request_id = _completion_chunk["id"]
+                choice = _completion_chunk["choices"][0]
+                if choice["finish_reason"] is not None:
+                    completion_tokens = index
+                    total_tokens = prompt_tokens + completion_tokens
+                    _completion_chunk["usage"] = CompletionUsage(
+                        prompt_tokens=total_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
                 yield _completion_chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk
 
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
 
         generate_config = self._sanitize_generate_config(generate_config)
-
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
 
         if not stream:
             assert self._llm is not None
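
The llama.cpp path takes a slightly different route: it tokenizes the prompt with llama.cpp's own tokenizer to get prompt_tokens, takes the chunk index at finish time as completion_tokens, and appends a usage-only CompletionChunk when include_usage is set. Below is a simplified, self-contained sketch of that pattern; it is not the xinference implementation, and the function name and chunk layout are illustrative:

import time
import uuid
from typing import Dict, Iterator, List


def stream_with_usage(
    chunks: Iterator[Dict],        # upstream OpenAI-style completion chunks
    prompt_token_ids: List[int],   # prompt already tokenized by the backend
    model: str,
    include_usage: bool,
) -> Iterator[Dict]:
    prompt_tokens = len(prompt_token_ids)
    completion_tokens = 0
    for chunk in chunks:
        completion_tokens += 1     # roughly one streamed chunk per generated token
        yield chunk                # pass upstream chunks through unchanged
    if include_usage:
        # Trailing chunk with empty choices that carries only the token counts.
        yield {
            "id": f"cmpl-{uuid.uuid4()}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": model,
            "choices": [],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }

In the actual hunk, the usage fields are also filled in on the chunk whose finish_reason is set, and the trailing chunk reuses the upstream request id instead of minting a new one.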