xinference 0.7.4__py3-none-any.whl → 0.7.5__py3-none-any.whl
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +22 -8
- xinference/client/oscar/actor_client.py +78 -8
- xinference/client/restful/restful_client.py +86 -0
- xinference/core/model.py +14 -7
- xinference/core/supervisor.py +12 -0
- xinference/deploy/cmdline.py +16 -0
- xinference/deploy/test/test_cmdline.py +1 -0
- xinference/model/embedding/model_spec.json +40 -0
- xinference/model/llm/__init__.py +14 -1
- xinference/model/llm/llm_family.json +10 -1
- xinference/model/llm/llm_family.py +38 -2
- xinference/model/llm/llm_family_modelscope.json +10 -1
- xinference/model/llm/pytorch/chatglm.py +1 -0
- xinference/model/llm/pytorch/core.py +1 -1
- xinference/model/llm/pytorch/utils.py +50 -18
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +13 -4
- xinference/model/multimodal/core.py +1 -1
- xinference/model/multimodal/qwen_vl.py +34 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/RECORD +32 -32
- xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
- /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
CHANGED

@@ -17,7 +17,7 @@ import os
 import platform
 import shutil
 from threading import Lock
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

 from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper

@@ -41,6 +41,8 @@ logger = logging.getLogger(__name__)

 DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
+BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()


 class GgmlLLMSpecV1(BaseModel):

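The companion change in xinference/model/llm/__init__.py (+14 −1) is not shown in this rendering; presumably it fills these two sets while the builtin families are registered, so custom registrations can later be validated against them. A rough sketch under that assumption (BUILTIN_LLM_FAMILIES already exists in llm_family.py):

```python
# Assumed sketch only -- the actual __init__.py hunk is not shown in this diff.
# Idea: record every builtin family name so CustomLLMFamilyV1 can check a
# custom model's `model_family` against the builtin chat/generate families.
from xinference.model.llm.llm_family import (
    BUILTIN_LLM_FAMILIES,
    BUILTIN_LLM_MODEL_CHAT_FAMILIES,
    BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
)

for family in BUILTIN_LLM_FAMILIES:
    if "chat" in family.model_ability:
        BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(family.model_name)
    else:
        BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(family.model_name)
```
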
@@ -105,6 +107,8 @@ class LLMFamilyV1(BaseModel):
     model_lang: List[str]
     model_ability: List[Literal["embed", "generate", "chat"]]
     model_description: Optional[str]
+    # reason for not required str here: legacy registration
+    model_family: Optional[str]
     model_specs: List["LLMSpecV1"]
     prompt_style: Optional["PromptStyleV1"]

@@ -134,7 +138,39 @@ class CustomLLMFamilyV1(LLMFamilyV1):
             )
         except (ValueError, TypeError, UnicodeDecodeError) as e:
             raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
-        llm_spec = cls.parse_obj(obj)
+        llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj)
+
+        # check model_family
+        if llm_spec.model_family is None:
+            raise ValueError(
+                f"You must specify `model_family` when registering custom LLM models."
+            )
+        assert isinstance(llm_spec.model_family, str)
+        if (
+            llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for chat model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
+            )
+        if (
+            llm_spec.model_family != "other"
+            and "chat" not in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for generate model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}"
+            )
+        # set prompt style when it is the builtin model family
+        if (
+            llm_spec.prompt_style is None
+            and llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+        ):
+            llm_spec.prompt_style = llm_spec.model_family

         # handle prompt style when user choose existing style
         if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str):

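In practice, a custom LLM registration now has to declare its lineage. Below is a minimal, hypothetical spec that should pass the new checks, assuming `llama-2-chat` is among the builtin chat families; the model name, path, and size are made up:

```python
import json

from xinference.model.llm.llm_family import CustomLLMFamilyV1

custom_spec = {
    "version": 1,
    "context_length": 4096,
    "model_name": "my-llama-2-chat",            # made-up name
    "model_lang": ["en"],
    "model_ability": ["chat"],
    "model_family": "llama-2-chat",             # must be "other" or a builtin family
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 7,
            "quantizations": ["none"],
            "model_uri": "file:///path/to/llama-2-chat-hf",  # made-up path
        }
    ],
    # prompt_style is omitted on purpose: for a builtin chat family the new
    # code above fills it in from the family name.
}

family = CustomLLMFamilyV1.parse_raw(json.dumps(custom_spec))
# family.prompt_style should now carry the builtin llama-2 chat style.
```
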
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -331,6 +331,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },

@@ -357,7 +366,7 @@
       ],
       "model_hub": "modelscope",
       "model_id": "ZhipuAI/chatglm3-6b-32k",
-      "model_revision": "
+      "model_revision": "master"
     }
   ],
   "prompt_style": {

xinference/model/llm/pytorch/chatglm.py
CHANGED

@@ -58,6 +58,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_path,
             trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
             revision=kwargs["revision"],
         )
         model = AutoModel.from_pretrained(

xinference/model/llm/pytorch/core.py
CHANGED

@@ -409,7 +409,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     ) -> PytorchGenerateConfig:
         generate_config = super()._sanitize_generate_config(generate_config)
         if (
-            generate_config.get("stop"
+            (not generate_config.get("stop"))
             and self.model_family.prompt_style
             and self.model_family.prompt_style.stop
         ):

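The old condition (truncated in this rendering) only kicked in when `stop` was absent; the rewritten check also falls back to the family defaults when a caller passes an empty value. A standalone illustration of the intended behaviour, not the exact surrounding code:

```python
# Illustration only: the sanitized config now falls back to the prompt style's
# stop words for any falsy user-supplied value (missing, None, "", []).
def apply_default_stop(generate_config: dict, default_stop: list) -> dict:
    if (not generate_config.get("stop")) and default_stop:
        generate_config["stop"] = default_stop.copy()
    return generate_config

print(apply_default_stop({"stop": []}, ["<|user|>", "<|observation|>"]))
# {'stop': ['<|user|>', '<|observation|>']}
print(apply_default_stop({"stop": ["###"]}, ["<|user|>"]))
# {'stop': ['###']} -- an explicit stop list is left untouched
```
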
xinference/model/llm/pytorch/utils.py
CHANGED

@@ -527,10 +527,12 @@ def generate_stream_chatglm(
     top_p = float(generate_config.get("top_p", 1.0))
     max_new_tokens = int(generate_config.get("max_tokens", 256))
     echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)

     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     input_echo_len = len(inputs["input_ids"][0])
-
     gen_kwargs = {
         "max_length": max_new_tokens + input_echo_len,
         "do_sample": True if temperature > 1e-5 else False,

@@ -543,7 +545,9 @@ def generate_stream_chatglm(

     total_len = 0
     last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
         total_ids = total_ids.tolist()[0]
         total_len = len(total_ids)
         if echo:

@@ -553,29 +557,57 @@ def generate_stream_chatglm(
         response = tokenizer.decode(output_ids)
         response = process_response(response)

+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
         if stream:
             response = response.strip("�")
             tmp_response_length = len(response)
             response = response[last_response_length:]
             last_response_length = tmp_response_length

-        completion_choice = CompletionChoice(
-            text=response, index=0, logprobs=None, finish_reason=None
-        )
-        completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=(total_len - input_echo_len),
-            total_tokens=total_len,
-        )
+        if not partially_stopped:
+            completion_choice = CompletionChoice(
+                text=response, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=(total_len - input_echo_len),
+                total_tokens=total_len,
+            )
+
+            yield completion_chunk, completion_usage

-        yield completion_chunk, completion_usage
+        if stopped:
+            break

     if total_len - input_echo_len == max_new_tokens - 1:
         finish_reason = "length"

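The new stop handling leans on an `is_partial_stop` helper to hold back chunks that might be the beginning of a stop sequence. Its definition is not part of this hunk; a typical FastChat-style implementation looks roughly like this:

```python
def is_partial_stop(output: str, stop_str: str) -> bool:
    """Return True if `output` ends with a non-empty prefix of `stop_str`.

    Sketch of the helper assumed by the streaming loop above: while the tail
    of the decoded text could still grow into the full stop string, the chunk
    is withheld instead of being yielded.
    """
    for i in range(min(len(output), len(stop_str)), 0, -1):
        if stop_str.startswith(output[-i:]):
            return True
    return False


print(is_partial_stop("Hello <|us", "<|user|>"))   # True  -- may complete to the stop string
print(is_partial_stop("Hello world", "<|user|>"))  # False
```
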
xinference/model/llm/utils.py
CHANGED

@@ -141,7 +141,7 @@ class ChatModelMixin:
             return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
                 else []
             )

@@ -155,7 +155,7 @@ class ChatModelMixin:
                 if content:
                     if role == "tool":
                         role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                 else:
                     prompts.append(f"<|{role}|>")
             return "\n".join(prompts)

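Both edits add a space after the newline that follows each role tag, lining the prompt up with ChatGLM3's chat format (together with `encode_special_tokens=True` above). A rough, self-contained illustration of the resulting prompt text; the sample history and the trailing `<|assistant|>` tag are illustrative assumptions:

```python
# Illustrative reconstruction of the CHATGLM3 prompt layout after this change.
system_prompt = "You are a helpful assistant."
history = [("user", "What is 2 + 2?"), ("assistant", "4"), ("user", "And 3 + 3?")]

prompts = [f"<|system|>\n {system_prompt}"] if system_prompt else []
for role, content in history:
    prompts.append(f"<|{role}|>\n {content}" if content else f"<|{role}|>")
prompts.append("<|assistant|>")  # assumption: generation is primed with a bare role tag

print("\n".join(prompts))
# <|system|>
#  You are a helpful assistant.
# <|user|>
#  What is 2 + 2?
# <|assistant|>
#  4
# ...
```
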
xinference/model/llm/vllm/core.py
CHANGED

@@ -37,6 +37,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)

@@ -197,8 +198,12 @@ class VLLMModel(LLM):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
         if "generate" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED

@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
         if "chat" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED

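The practical effect of both hunks: a custom registration is matched on its declared `model_family` rather than its arbitrary `model_name`, so a renamed derivative of a vLLM-supported family can still be served by the vLLM backend. A simplified sketch of the decision, not the exact `match` method:

```python
from xinference.model.llm.llm_family import CustomLLMFamilyV1

# Simplified sketch of the vLLM match logic after this change.
def vllm_can_serve(llm_family, supported_names) -> bool:
    # Custom registrations are judged by the family they declare; builtin
    # models keep matching on their own name.
    if isinstance(llm_family, CustomLLMFamilyV1):
        return llm_family.model_family in supported_names
    return llm_family.model_name in supported_names
```
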
xinference/model/multimodal/qwen_vl.py
CHANGED

@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import base64
+import logging
 import operator
+import tempfile
 import time
 import uuid
 from typing import Dict, Iterator, List, Optional, Union
+from urllib.parse import urlparse

 from ...types import (
     ChatCompletion,

@@ -26,6 +29,8 @@ from ...types import (
 from ..utils import select_device
 from .core import LVLM, LVLMFamilyV1, LVLMSpecV1

+logger = logging.getLogger(__name__)
+

 class QwenVLChat(LVLM):
     def __init__(self, *args, **kwargs):

@@ -67,9 +72,32 @@ class QwenVLChat(LVLM):
         )

     def _message_content_to_qwen(self, content) -> str:
+        def _ensure_url(_url):
+            try:
+                if _url.startswith("data:"):
+                    raise "Not a valid url."
+                parsed = urlparse(_url)
+                if not parsed.scheme:
+                    raise "Not a valid url."
+                return _url
+            except Exception:
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                    logging.info("Dump base64 data to %s", f.name)
+                    return f.name
+
         if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
             content = [
-                {"image": c["image_url"]["url"], "type": "image"}
+                {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
                 if c.get("type") == "image_url"
                 else c
                 for c in content

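With `_ensure_url` in place, qwen-vl-chat accepts both plain image URLs and OpenAI-style base64 data URLs, the latter being decoded into a temporary file before reaching the model. A hypothetical client-side message showing both forms (the file name, URL, and model handle are assumptions; streaming is rejected by the chat() change in the next hunk):

```python
import base64

# Hypothetical local image encoded as an OpenAI-style data URL.
with open("cat.png", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

prompt = [
    {"type": "text", "text": "What is in these two images?"},
    {"type": "image_url", "image_url": {"url": "https://example.com/dog.jpg"}},
    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}},
]

# `model` stands for a handle to a launched qwen-vl-chat model.
# completion = model.chat(prompt=prompt)
```
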
@@ -85,6 +113,10 @@ class QwenVLChat(LVLM):
         chat_history: Optional[List[Dict]] = None,
         generate_config: Optional[Dict] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
         prompt = self._message_content_to_qwen(prompt)
         # Convert openai history to qwen vl history
         qwen_history = []

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>