xinference 0.7.4.1__py3-none-any.whl → 0.7.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +22 -8
- xinference/client/oscar/actor_client.py +78 -8
- xinference/core/model.py +14 -7
- xinference/core/supervisor.py +12 -0
- xinference/deploy/cmdline.py +16 -0
- xinference/deploy/test/test_cmdline.py +1 -0
- xinference/model/embedding/model_spec.json +40 -0
- xinference/model/llm/__init__.py +14 -1
- xinference/model/llm/llm_family.json +10 -1
- xinference/model/llm/llm_family.py +38 -2
- xinference/model/llm/llm_family_modelscope.json +10 -1
- xinference/model/llm/pytorch/chatglm.py +1 -0
- xinference/model/llm/pytorch/core.py +1 -1
- xinference/model/llm/pytorch/utils.py +50 -18
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +13 -4
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/RECORD +29 -29
- xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
- /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py
CHANGED
@@ -527,10 +527,12 @@ def generate_stream_chatglm(
     top_p = float(generate_config.get("top_p", 1.0))
     max_new_tokens = int(generate_config.get("max_tokens", 256))
     echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)

     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     input_echo_len = len(inputs["input_ids"][0])
-
     gen_kwargs = {
         "max_length": max_new_tokens + input_echo_len,
         "do_sample": True if temperature > 1e-5 else False,
@@ -543,7 +545,9 @@ def generate_stream_chatglm(

     total_len = 0
     last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
         total_ids = total_ids.tolist()[0]
         total_len = len(total_ids)
         if echo:
@@ -553,29 +557,57 @@ def generate_stream_chatglm(
         response = tokenizer.decode(output_ids)
         response = process_response(response)

+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
         if stream:
             response = response.strip("�")
             tmp_response_length = len(response)
             response = response[last_response_length:]
             last_response_length = tmp_response_length

-            completion_choice = CompletionChoice(
-                text=response, index=0, logprobs=None, finish_reason=None
-            )
-            completion_chunk = CompletionChunk(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
-                prompt_tokens=input_echo_len,
-                completion_tokens=(total_len - input_echo_len),
-                total_tokens=total_len,
-            )
+            if not partially_stopped:
+                completion_choice = CompletionChoice(
+                    text=response, index=0, logprobs=None, finish_reason=None
+                )
+                completion_chunk = CompletionChunk(
+                    id=str(uuid.uuid1()),
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=input_echo_len,
+                    completion_tokens=(total_len - input_echo_len),
+                    total_tokens=total_len,
+                )
+
+                yield completion_chunk, completion_usage

-            yield completion_chunk, completion_usage
+        if stopped:
+            break

         if total_len - input_echo_len == max_new_tokens - 1:
             finish_reason = "length"
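The new stop handling above holds back a streamed chunk whenever the decoded text could still grow into a stop string, which it detects via is_partial_stop. That helper is imported elsewhere in pytorch/utils.py and is not part of this diff; the sketch below only illustrates the check it performs (some suffix of the current output matching a prefix of the stop string), not the actual implementation.

# Illustrative sketch, not the implementation shipped in xinference:
# a "partial stop" means some suffix of the streamed text is a prefix of the stop string.
def is_partial_stop(output: str, stop_str: str) -> bool:
    for i in range(min(len(output), len(stop_str)), 0, -1):
        if output.endswith(stop_str[:i]):
            return True
    return False

# Example: with stop_str="Observation:", a chunk ending in "Observ" is withheld
# (partially_stopped is True) until later tokens confirm or rule out the stop string.
print(is_partial_stop("Thought: done\nObserv", "Observation:"))  # True
print(is_partial_stop("Thought: done", "Observation:"))          # False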
xinference/model/llm/utils.py
CHANGED
@@ -141,7 +141,7 @@ class ChatModelMixin:
             return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
                 else []
             )
@@ -155,7 +155,7 @@ class ChatModelMixin:
                 if content:
                     if role == "tool":
                         role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                 else:
                     prompts.append(f"<|{role}|>")
             return "\n".join(prompts)
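Both hunks insert a space between the role tag's newline and the message body, so the CHATGLM3 prompt assembled by ChatModelMixin now puts one leading space before each turn's content. An illustrative rendering (the messages and the trailing assistant tag are made up for the example; only the two f-strings come from this diff):

# Illustrative only: what the changed f-strings produce for one system + one user turn.
system_prompt = "You are a helpful assistant."
content = "What is Xinference?"

prompts = [f"<|system|>\n {system_prompt}"]
prompts.append(f"<|user|>\n {content}")
prompts.append("<|assistant|>")
print("\n".join(prompts))
# <|system|>
#  You are a helpful assistant.
# <|user|>
#  What is Xinference?
# <|assistant|>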
xinference/model/llm/vllm/core.py
CHANGED
@@ -37,6 +37,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)
@@ -197,8 +198,12 @@ class VLLMModel(LLM):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
         if "generate" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
         if "chat" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
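With this change, a custom model registered through CustomLLMFamilyV1 is matched against the vLLM allow-lists by its model_family (the built-in family it declares), while built-in models are still matched by model_name. A standalone sketch of that rule, using stand-in objects rather than the real pydantic families and an assumed subset of VLLM_SUPPORTED_MODELS:

# Sketch of the matching rule added to VLLMModel.match / VLLMChatModel.match.
# SimpleNamespace stands in for the real LLMFamilyV1 / CustomLLMFamilyV1 models,
# and the supported set below is an assumed example, not the full list in core.py.
from types import SimpleNamespace

VLLM_SUPPORTED_MODELS = {"llama-2", "baichuan-2"}

def matches_vllm(llm_family, is_custom: bool) -> bool:
    key = llm_family.model_family if is_custom else llm_family.model_name
    return key in VLLM_SUPPORTED_MODELS

builtin = SimpleNamespace(model_name="llama-2", model_family=None)
custom = SimpleNamespace(model_name="my-llama-finetune", model_family="llama-2")

print(matches_vllm(builtin, is_custom=False))  # True: built-in matched by model_name
print(matches_vllm(custom, is_custom=True))    # True: custom matched by model_family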
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>