xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of xinference might be problematic.

Files changed (120)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/__init__.py +13 -0
  3. xinference/api/oauth2/common.py +14 -0
  4. xinference/api/oauth2/core.py +93 -0
  5. xinference/api/oauth2/types.py +36 -0
  6. xinference/api/oauth2/utils.py +44 -0
  7. xinference/api/restful_api.py +216 -27
  8. xinference/client/oscar/actor_client.py +18 -18
  9. xinference/client/restful/restful_client.py +96 -33
  10. xinference/conftest.py +63 -1
  11. xinference/constants.py +1 -0
  12. xinference/core/chat_interface.py +143 -3
  13. xinference/core/metrics.py +83 -0
  14. xinference/core/model.py +244 -181
  15. xinference/core/status_guard.py +86 -0
  16. xinference/core/supervisor.py +57 -7
  17. xinference/core/worker.py +134 -13
  18. xinference/deploy/cmdline.py +142 -16
  19. xinference/deploy/local.py +39 -7
  20. xinference/deploy/supervisor.py +2 -0
  21. xinference/deploy/worker.py +33 -5
  22. xinference/fields.py +4 -1
  23. xinference/model/core.py +8 -1
  24. xinference/model/embedding/core.py +3 -2
  25. xinference/model/embedding/model_spec_modelscope.json +60 -18
  26. xinference/model/image/stable_diffusion/core.py +4 -3
  27. xinference/model/llm/__init__.py +7 -0
  28. xinference/model/llm/ggml/llamacpp.py +3 -2
  29. xinference/model/llm/llm_family.json +87 -3
  30. xinference/model/llm/llm_family.py +15 -5
  31. xinference/model/llm/llm_family_modelscope.json +92 -3
  32. xinference/model/llm/pytorch/chatglm.py +70 -28
  33. xinference/model/llm/pytorch/core.py +11 -30
  34. xinference/model/llm/pytorch/internlm2.py +155 -0
  35. xinference/model/llm/pytorch/utils.py +0 -153
  36. xinference/model/llm/utils.py +37 -8
  37. xinference/model/llm/vllm/core.py +15 -3
  38. xinference/model/multimodal/__init__.py +15 -8
  39. xinference/model/multimodal/core.py +8 -1
  40. xinference/model/multimodal/model_spec.json +9 -0
  41. xinference/model/multimodal/model_spec_modelscope.json +45 -0
  42. xinference/model/multimodal/qwen_vl.py +5 -9
  43. xinference/model/utils.py +7 -2
  44. xinference/types.py +2 -0
  45. xinference/web/ui/build/asset-manifest.json +3 -3
  46. xinference/web/ui/build/index.html +1 -1
  47. xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
  48. xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
  49. xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
  88. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
  89. xinference/web/ui/node_modules/.package-lock.json +36 -0
  90. xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
  91. xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
  92. xinference/web/ui/node_modules/react-cookie/package.json +55 -0
  93. xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
  94. xinference/web/ui/package-lock.json +37 -0
  95. xinference/web/ui/package.json +3 -2
  96. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
  97. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
  98. xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
  99. xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
  101. xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
  102. xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
  103. xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
  104. xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
  105. xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
  106. xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
  107. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
  108. xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
  109. xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
  110. xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
  111. xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
  112. xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
  117. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
  118. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
  119. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
  120. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/vllm/core.py CHANGED
@@ -94,6 +94,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
+    "mistral-instruct-v0.2",
     "chatglm3",
 ]
 
@@ -170,7 +171,7 @@ class VLLMModel(LLM):
         )
         sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
         sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
-        sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 16))
+        sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
         sanitized.setdefault("stop", generate_config.get("stop", None))
         sanitized.setdefault(
             "stop_token_ids", generate_config.get("stop_token_ids", None)
@@ -303,6 +304,16 @@ class VLLMModel(LLM):
                     delta = choice["text"][len(previous_texts[i]) :]
                     previous_texts[i] = choice["text"]
                     choice["text"] = delta
+                prompt_tokens = len(_request_output.prompt_token_ids)
+                completion_tokens = sum(
+                    len(output.token_ids) for output in _request_output.outputs
+                )
+                total_tokens = prompt_tokens + completion_tokens
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
                 yield chunk
 
         if stream:
@@ -379,7 +390,8 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-        if tools and self.model_family.model_name == "qwen-chat":
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -400,6 +412,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._tool_calls_completion(
-                    self.model_family.model_name, self.model_uid, c, tools
+                    self.model_family, self.model_uid, c, tools
                 )
             return self._to_chat_completion(c)
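
Two behavioral notes on the hunks above: the server-side default for max_tokens rises from 16 to 1024 when the caller sets none, and every streamed vLLM chunk now carries a cumulative usage payload. A minimal client-side sketch, assuming a running Xinference server with an already-launched generate-capable model; the endpoint and model UID are placeholders:

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
model = client.get_model("my-model-uid")         # placeholder model UID

# Each streamed chunk should now include a cumulative "usage" entry
# alongside the incremental text delta.
for chunk in model.generate(
    "What is the capital of France?",
    generate_config={"stream": True},
):
    delta = chunk["choices"][0]["text"]
    usage = chunk.get("usage")  # NotRequired: absent on pre-0.8.1 servers
    if usage is not None:
        print(f"{delta!r} ({usage['total_tokens']} tokens so far)")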
xinference/model/multimodal/__init__.py CHANGED
@@ -30,16 +30,23 @@ MODEL_CLASSES.append(QwenVLChat)
 
 
 def _install():
-    json_path = os.path.join(
+    json_path_huggingface = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "model_spec.json"
     )
-    for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
-        model_family = LVLMFamilyV1.parse_obj(json_obj)
-        BUILTIN_LVLM_FAMILIES.append(model_family)
-        for model_spec in model_family.model_specs:
-            MODEL_NAME_TO_REVISION[model_family.model_name].append(
-                model_spec.model_revision
-            )
+    json_path_modelscope = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "model_spec_modelscope.json"
+    )
+    for builtin_family, json_path in [
+        (BUILTIN_LVLM_FAMILIES, json_path_huggingface),
+        (BUILTIN_MODELSCOPE_LVLM_FAMILIES, json_path_modelscope),
+    ]:
+        for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
+            model_family = LVLMFamilyV1.parse_obj(json_obj)
+            builtin_family.append(model_family)
+            for model_spec in model_family.model_specs:
+                MODEL_NAME_TO_REVISION[model_family.model_name].append(
+                    model_spec.model_revision
+                )
 
 
 _install()
xinference/model/multimodal/core.py CHANGED
@@ -203,6 +203,8 @@ def match_multimodal(
                 and matched_quantization is None
             ):
                 continue
+            # Copy spec to avoid _apply_format_to_model_id modify the original spec.
+            spec = spec.copy()
             if quantization:
                 return (
                     family,
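
The added spec.copy() guards the shared registry entry: _apply_format_to_model_id later substitutes the {quantization} placeholder into model_id in place, which would otherwise corrupt the builtin spec for subsequent lookups. A hypothetical mini-repro of that hazard, using a stand-in pydantic model rather than the real LVLMSpecV1:

from pydantic import BaseModel

class FakeSpec(BaseModel):  # stand-in for LVLMSpecV1
    model_id: str

registry = [FakeSpec(model_id="Qwen/Qwen-VL-Chat-{quantization}")]

spec = registry[0].copy()  # work on a copy, as the diff now does
spec.model_id = spec.model_id.format(quantization="Int4")

assert spec.model_id == "Qwen/Qwen-VL-Chat-Int4"
assert "{quantization}" in registry[0].model_id  # registry entry stays pristine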
@@ -328,6 +330,11 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
+    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+        assert quantization is not None
+        return os.path.exists(
+            _get_meta_path(cache_dir, model_format, model_hub, quantization)
+        )
     else:
         raise ValueError(f"Unsupported format: {model_format}")
 
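For the single-file formats (ggmlv3, ggufv2, gptq) the cache is tracked per quantization, so the skip check reduces to testing for a completion marker. A rough sketch of the idea; the marker naming below is hypothetical and merely stands in for the package's real _get_meta_path:

import os

def _meta_marker(cache_dir: str, model_format: str, model_hub: str, quantization: str) -> str:
    # Hypothetical naming scheme; the real layout comes from _get_meta_path.
    return os.path.join(cache_dir, f"__valid_{model_format}_{model_hub}_{quantization}.done")

def skip_download(cache_dir: str, model_format: str, model_hub: str, quantization: str) -> bool:
    # A finished per-quantization download is detected by its marker file.
    return os.path.exists(_meta_marker(cache_dir, model_format, model_hub, quantization))
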
@@ -414,7 +421,7 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
-    if model_spec.model_format in ["pytorch"]:
+    if model_spec.model_format in ["pytorch", "gptq"]:
         assert isinstance(model_spec, LVLMSpecV1)
         retry_download(
             huggingface_hub.snapshot_download,
xinference/model/multimodal/model_spec.json CHANGED
@@ -20,6 +20,15 @@
             ],
             "model_id": "Qwen/Qwen-VL-Chat",
             "model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42"
+        },
+        {
+            "model_format": "gptq",
+            "model_size_in_billions": 7,
+            "quantizations": [
+                "Int4"
+            ],
+            "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+            "model_revision": "5d3a5aa033ed2c502300d426c81cc5b13bcd1409"
         }
     ],
     "prompt_style": {
xinference/model/multimodal/model_spec_modelscope.json ADDED
@@ -0,0 +1,45 @@
+[
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "qwen-vl-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "Qwen/Qwen-VL-Chat",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "Int4"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "QWEN",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ]
+        }
+    }
+]
xinference/model/multimodal/qwen_vl.py CHANGED
@@ -18,7 +18,6 @@ import tempfile
 import time
 import uuid
 from typing import Dict, Iterator, List, Optional, Union
-from urllib.parse import urlparse
 
 from ...types import (
     ChatCompletion,
@@ -73,14 +72,7 @@ class QwenVLChat(LVLM):
 
     def _message_content_to_qwen(self, content) -> str:
         def _ensure_url(_url):
-            try:
-                if _url.startswith("data:"):
-                    raise "Not a valid url."
-                parsed = urlparse(_url)
-                if not parsed.scheme:
-                    raise "Not a valid url."
-                return _url
-            except Exception:
+            if _url.startswith("data:"):
                 logging.info("Parse url by base64 decoder.")
                 # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
                 # e.g. f"data:image/jpeg;base64,{base64_image}"
@@ -93,6 +85,10 @@ class QwenVLChat(LVLM):
                     f.write(data)
                 logging.info("Dump base64 data to %s", f.name)
                 return f.name
+            else:
+                if len(_url) > 2048:
+                    raise Exception(f"Image url is too long, {len(_url)} > 2048.")
+                return _url
 
         if not isinstance(content, str):
             # TODO(codingl2k1): Optimize _ensure_url
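
The rewritten _ensure_url branches on the prefix alone: a data: URL is base64-decoded into a temp file, and anything else passes through as a plain URL capped at 2048 characters. A self-contained sketch of the decode branch, with a helper name of our own choosing rather than the package's:

import base64
import tempfile

def dump_data_url_to_file(data_url: str) -> str:
    # e.g. "data:image/jpeg;base64,<payload>", per the OpenAI vision docs
    header, _, payload = data_url.partition(",")
    suffix = "." + header.split(";")[0].split("/")[-1]  # "data:image/png;base64" -> ".png"
    data = base64.b64decode(payload)
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
        f.write(data)
    return f.name

path = dump_data_url_to_file("data:image/png;base64," + base64.b64encode(b"\x89PNG").decode())
print(path)  # temp file holding the decoded bytes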
xinference/model/utils.py CHANGED
@@ -153,8 +153,13 @@ def is_model_cached(model_spec: Any, name_to_revisions_mapping: Dict):
 
 
 def is_valid_model_name(model_name: str) -> bool:
-    model_name = model_name.strip()
-    return 0 < len(model_name) <= 100
+    import re
+
+    if len(model_name) == 0:
+        return False
+
+    # check if contains +/?%#&=\s
+    return re.match(r"^[^+\/?%#&=\s]*$", model_name) is not None
 
 
 def parse_uri(uri: str) -> Tuple[str, str]:
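
The new validator swaps the old length rule (1-100 characters after stripping) for a character blacklist: any name containing one of the URL-special characters + / ? % # & = or whitespace is rejected, and the 100-character upper bound is gone. A quick check of the behavior, reusing the regex from the diff:

import re

def is_valid_model_name(model_name: str) -> bool:
    if len(model_name) == 0:
        return False
    return re.match(r"^[^+\/?%#&=\s]*$", model_name) is not None

assert is_valid_model_name("my-model_v2")
assert not is_valid_model_name("my model")  # whitespace
assert not is_valid_model_name("llama/7b")  # slash
assert not is_valid_model_name("")          # empty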
xinference/types.py CHANGED
@@ -110,6 +110,7 @@ class CompletionChunk(TypedDict):
     created: int
     model: str
     choices: List[CompletionChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class Completion(TypedDict):
@@ -160,6 +161,7 @@ class ChatCompletionChunk(TypedDict):
     object: Literal["chat.completion.chunk"]
     created: int
     choices: List[ChatCompletionChunkChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class ChatglmCppModelConfig(TypedDict, total=False):
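
Because usage is NotRequired, the key may be missing from a chunk, so consumers should read it with .get rather than indexing. A minimal typed sketch, with the chunk fields trimmed to the ones relevant here:

from typing_extensions import NotRequired, TypedDict

class CompletionUsage(TypedDict):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class CompletionChunk(TypedDict):
    choices: list  # trimmed; the real type also carries id/object/created/model
    usage: NotRequired[CompletionUsage]

def total_tokens_so_far(chunk: CompletionChunk) -> int:
    usage = chunk.get("usage")
    return usage["total_tokens"] if usage is not None else 0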
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.236e72e7.js",
+    "main.js": "./static/js/main.b83095c2.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
+    "main.b83095c2.js.map": "./static/js/main.b83095c2.js.map"
   },
   "entrypoints": [
-    "static/js/main.236e72e7.js"
+    "static/js/main.b83095c2.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.b83095c2.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>