xinference 0.15.3__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (65)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +29 -2
  4. xinference/client/restful/restful_client.py +10 -0
  5. xinference/constants.py +7 -3
  6. xinference/core/image_interface.py +76 -23
  7. xinference/core/model.py +158 -46
  8. xinference/core/progress_tracker.py +187 -0
  9. xinference/core/scheduler.py +10 -7
  10. xinference/core/supervisor.py +11 -0
  11. xinference/core/utils.py +9 -0
  12. xinference/core/worker.py +1 -0
  13. xinference/deploy/supervisor.py +4 -0
  14. xinference/model/__init__.py +4 -0
  15. xinference/model/audio/chattts.py +2 -1
  16. xinference/model/audio/core.py +0 -2
  17. xinference/model/audio/model_spec.json +8 -0
  18. xinference/model/audio/model_spec_modelscope.json +9 -0
  19. xinference/model/image/core.py +6 -7
  20. xinference/model/image/scheduler/__init__.py +13 -0
  21. xinference/model/image/scheduler/flux.py +533 -0
  22. xinference/model/image/sdapi.py +35 -4
  23. xinference/model/image/stable_diffusion/core.py +215 -110
  24. xinference/model/image/utils.py +39 -3
  25. xinference/model/llm/__init__.py +2 -0
  26. xinference/model/llm/llm_family.json +185 -17
  27. xinference/model/llm/llm_family_modelscope.json +124 -12
  28. xinference/model/llm/transformers/chatglm.py +104 -0
  29. xinference/model/llm/transformers/cogvlm2.py +2 -1
  30. xinference/model/llm/transformers/cogvlm2_video.py +2 -0
  31. xinference/model/llm/transformers/core.py +43 -113
  32. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  33. xinference/model/llm/transformers/deepseek_vl.py +2 -0
  34. xinference/model/llm/transformers/glm4v.py +2 -1
  35. xinference/model/llm/transformers/intern_vl.py +2 -0
  36. xinference/model/llm/transformers/internlm2.py +3 -95
  37. xinference/model/llm/transformers/minicpmv25.py +2 -0
  38. xinference/model/llm/transformers/minicpmv26.py +2 -0
  39. xinference/model/llm/transformers/omnilmm.py +2 -0
  40. xinference/model/llm/transformers/opt.py +68 -0
  41. xinference/model/llm/transformers/qwen2_audio.py +11 -4
  42. xinference/model/llm/transformers/qwen2_vl.py +2 -28
  43. xinference/model/llm/transformers/qwen_vl.py +2 -1
  44. xinference/model/llm/transformers/utils.py +36 -283
  45. xinference/model/llm/transformers/yi_vl.py +2 -0
  46. xinference/model/llm/utils.py +60 -16
  47. xinference/model/llm/vllm/core.py +68 -9
  48. xinference/model/llm/vllm/utils.py +0 -1
  49. xinference/model/utils.py +7 -4
  50. xinference/model/video/core.py +0 -2
  51. xinference/utils.py +2 -3
  52. xinference/web/ui/build/asset-manifest.json +3 -3
  53. xinference/web/ui/build/index.html +1 -1
  54. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  55. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  57. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/METADATA +38 -6
  58. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/RECORD +63 -59
  59. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  61. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  62. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  63. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  64. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  65. {xinference-0.15.3.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py CHANGED
@@ -29,6 +29,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -50,6 +51,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen1.5-moe-chat",
     "qwen2-instruct",
     "qwen2-moe-instruct",
+    "qwen2.5-instruct",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -57,6 +59,10 @@ GLM4_TOOL_CALL_FAMILY = [
     "glm4-chat-1m",
 ]
 
+LLAMA3_TOOL_CALL_FAMILY = [
+    "llama-3.1-instruct",
+]
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
@@ -113,7 +119,7 @@ class ChatModelMixin:
         return self._build_from_raw_template(messages, chat_template, **kwargs)
 
     @staticmethod
-    def get_specific_prompt(model_family: str, messages: List[Dict]):
+    def get_specific_prompt(model_family: str, messages: List[ChatCompletionMessage]):
         """
         Inspired by FastChat. Format chat history into a prompt according to the prompty style of
         different models.
@@ -129,7 +135,7 @@ class ChatModelMixin:
             ret = (
                 "<s>"
                 if system_prompt == ""
-                else "<s><|im_start|>system\n"
+                else "<s><|im_start|>system\n"  # type: ignore
                 + system_prompt
                 + intra_message_sep
                 + "\n"
@@ -333,8 +339,9 @@ class ChatModelMixin:
         for content in contents:
             content = content.strip()
             if content:
-                if content.startswith(QWEN_TOOL_CALL_SYMBOLS[0]):
-                    content = content[len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
+                pos = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos != -1:
+                    content = content[pos + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
                 content = content.strip()
                 try:
                     res = json.loads(content)
@@ -353,6 +360,15 @@ class ChatModelMixin:
         text = c["choices"][0]["text"]
         return cls._handle_qwen_tool_result(text)
 
+    @classmethod
+    def _eval_llama3_chat_arguments(cls, c) -> List[Tuple]:
+        text = c["choices"][0]["text"]
+        try:
+            data = eval(text, {}, {})
+            return [(None, data["name"], data["parameters"])]
+        except Exception:
+            return [(text, None, None)]
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
@@ -360,6 +376,8 @@ class ChatModelMixin:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
             result = cls._eval_qwen_chat_arguments(c)
+        elif family in LLAMA3_TOOL_CALL_FAMILY:
+            result = cls._eval_llama3_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
@@ -368,24 +386,22 @@ class ChatModelMixin:
         return result
 
     @classmethod
-    def _tool_calls_completion_chunk(cls, model_family, model_uid, c):
-        _id = str(uuid.uuid4())
+    def _tool_calls_completion_chunk(cls, model_family, model_uid, c, chunk_id=None):
+        _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
         tool_result = cls._eval_tool_arguments(model_family, c)
         tool_calls = []
         failed_contents = []
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
-                    [
-                        {
-                            "id": f"call_{_id}",
-                            "type": "function",
-                            "function": {
-                                "name": func,
-                                "arguments": json.dumps(args, ensure_ascii=False),
-                            },
-                        }
-                    ]
+                    {
+                        "id": f"call_{_id}",
+                        "type": "function",
+                        "function": {
+                            "name": func,
+                            "arguments": json.dumps(args, ensure_ascii=False),
+                        },
+                    }
                 )
             else:
                 failed_contents.append(content)
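With this change each entry appended to tool_calls is a flat dict rather than a single-element list, and callers can pass a chunk_id so every streamed chunk reuses the same call id. The resulting entry looks roughly like this; the function name and arguments are made-up examples:

import json
import uuid

chunk_id = str(uuid.uuid4())  # shared across streamed chunks via the new chunk_id argument
tool_call = {
    "id": f"call_{chunk_id}",
    "type": "function",
    "function": {
        "name": "get_weather",
        "arguments": json.dumps({"city": "Beijing"}, ensure_ascii=False),
    },
}
print(json.dumps([tool_call], indent=2))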
@@ -471,6 +487,34 @@
             "usage": usage,
         }
 
+    def _transform_messages(
+        self,
+        messages: List[ChatCompletionMessage],
+    ):
+        transformed_messages = []
+        for msg in messages:
+            new_content = []
+            role = msg["role"]
+            content = msg["content"]
+            if isinstance(content, str):
+                new_content.append({"type": "text", "text": content})
+            elif isinstance(content, List):
+                for item in content:  # type: ignore
+                    if "text" in item:
+                        new_content.append({"type": "text", "text": item["text"]})
+                    elif "image_url" in item:
+                        new_content.append(
+                            {"type": "image", "image": item["image_url"]["url"]}
+                        )
+                    elif "video_url" in item:
+                        new_content.append(
+                            {"type": "video", "video": item["video_url"]["url"]}
+                        )
+            new_message = {"role": role, "content": new_content}
+            transformed_messages.append(new_message)
+
+        return transformed_messages
+
 
 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
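The new _transform_messages helper rewrites OpenAI-style content parts (image_url / video_url) into the flat image / video form consumed by the Qwen-VL processing utilities. A rough before/after illustration with a placeholder URL:

openai_style = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]

# What _transform_messages would return for the message above:
transformed = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image", "image": "https://example.com/cat.png"},
        ],
    }
]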
xinference/model/llm/vllm/core.py CHANGED
@@ -34,6 +34,7 @@ from typing import (
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -175,6 +176,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
 if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+
 
 class VLLMModel(LLM):
     def __init__(
@@ -309,11 +313,6 @@ class VLLMModel(LLM):
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
         model_config.setdefault("max_model_len", None)
-        model_config["limit_mm_per_prompt"] = (
-            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-            if model_config.get("limit_mm_per_prompt")
-            else None
-        )
 
         return model_config
 
@@ -718,11 +717,26 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format != "pytorch":
+        if not cls._has_cuda_device():
+            return False
+        if not cls._is_linux():
+            return False
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+            if "4" not in quantization:
+                return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
            if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
                 return False
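The widened match now accepts gptq, awq and fp8 vision checkpoints subject to quantization limits. A condensed, self-contained restatement of just the format/quantization gating (not the full implementation, which also checks CUDA, Linux and the supported-model list):

def vision_format_supported(model_format: str, quantization: str, vllm_version: str) -> bool:
    if model_format not in ["pytorch", "gptq", "awq", "fp8"]:
        return False
    if model_format == "pytorch":
        return quantization in (None, "none")
    if model_format == "awq":
        return "4" in quantization  # AWQ is only accepted with 4-bit weights
    if model_format == "gptq":
        if vllm_version >= "0.3.3":
            return any(q in quantization for q in ("3", "4", "8"))
        return "4" in quantization
    return True  # fp8 needs no extra quantization check here


assert vision_format_supported("awq", "Int4", "0.6.3")
assert not vision_format_supported("awq", "Int8", "0.6.3")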
@@ -733,6 +747,33 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 return False
         return VLLM_INSTALLED
 
+    def _sanitize_model_config(
+        self, model_config: Optional[VLLMModelConfig]
+    ) -> VLLMModelConfig:
+        if model_config is None:
+            model_config = VLLMModelConfig()
+
+        cuda_count = self._get_cuda_count()
+
+        model_config.setdefault("tokenizer_mode", "auto")
+        model_config.setdefault("trust_remote_code", True)
+        model_config.setdefault("tensor_parallel_size", cuda_count)
+        model_config.setdefault("block_size", 16)
+        model_config.setdefault("swap_space", 4)
+        model_config.setdefault("gpu_memory_utilization", 0.90)
+        model_config.setdefault("max_num_seqs", 256)
+        model_config.setdefault("quantization", None)
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else {
+                "image": 2,  # default 2 images all chat
+            }
+        )
+
+        return model_config
+
     def _sanitize_chat_config(
         self,
         generate_config: Optional[Dict] = None,
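limit_mm_per_prompt can still be supplied as a JSON string, but the vision model now falls back to allowing two images per prompt instead of None. A small sketch of how an override is interpreted; the override value is a made-up example:

import json

model_config = {"limit_mm_per_prompt": '{"image": 4}'}  # hypothetical user override

limit = (
    json.loads(model_config["limit_mm_per_prompt"])
    if model_config.get("limit_mm_per_prompt")
    else {"image": 2}  # new default: up to 2 images per chat prompt
)
print(limit)  # {'image': 4}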
@@ -755,14 +796,32 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     @vllm_check
     async def async_chat(
         self,
-        messages: List[Dict],
+        messages: List[ChatCompletionMessage],  # type: ignore
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
+        messages = self._transform_messages(messages)
+        tools = generate_config.pop("tools", []) if generate_config else None
+
         model_family = self.model_family.model_family or self.model_family.model_name
-        prompt, images = self.get_specific_prompt(model_family, messages)
 
-        if len(images) == 0:
+        if "internvl2" not in model_family.lower():
+            from qwen_vl_utils import process_vision_info
+
+            full_context_kwargs = {}
+            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            assert self.model_family.chat_template is not None
+            prompt = self.get_full_context(
+                messages, self.model_family.chat_template, **full_context_kwargs
+            )
+            images, video_inputs = process_vision_info(messages)
+            if video_inputs:
+                raise ValueError("Not support video input now.")
+        else:
+            prompt, images = self.get_specific_prompt(model_family, messages)
+
+        if not images:
             inputs = {
                 "prompt": prompt,
             }
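For families other than InternVL2 the chat path now renders the prompt through the model's chat template and pulls images out of the transformed messages with qwen_vl_utils.process_vision_info. A hedged sketch of that flow; the prompt string is a placeholder and the multi_modal_data shape is an assumption based on vLLM's multi-modal input format rather than something shown in this hunk:

from qwen_vl_utils import process_vision_info  # provided by the qwen-vl-utils package

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/cat.png"},
            {"type": "text", "text": "What is in this picture?"},
        ],
    }
]

images, video_inputs = process_vision_info(messages)
if video_inputs:
    raise ValueError("Not support video input now.")

prompt = "<prompt rendered from the model's chat template>"  # placeholder
inputs = {"prompt": prompt}
if images:
    inputs["multi_modal_data"] = {"image": images}  # assumption: vLLM multi-modal input dict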
xinference/model/llm/vllm/utils.py CHANGED
@@ -26,7 +26,6 @@ def vllm_check(fn):
 
     @functools.wraps(fn)
     async def _async_wrapper(self, *args, **kwargs):
-        logger.info("vllm_check")
         try:
             return await fn(self, *args, **kwargs)
         except AsyncEngineDeadError:
xinference/model/utils.py CHANGED
@@ -23,12 +23,15 @@ import huggingface_hub
 import numpy as np
 import torch
 
-from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
+from ..constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_DOWNLOAD_MAX_ATTEMPTS,
+    XINFERENCE_ENV_MODEL_SRC,
+)
 from ..device_utils import get_available_device, is_device_available
 from .core import CacheableModelSpec
 
 logger = logging.getLogger(__name__)
-MAX_ATTEMPTS = 3
 IS_NEW_HUGGINGFACE_HUB: bool = huggingface_hub.__version__ >= "0.23.0"
 
 
@@ -100,11 +103,11 @@
     **kwargs,
 ):
     last_ex = None
-    for current_attempt in range(1, MAX_ATTEMPTS + 1):
+    for current_attempt in range(1, XINFERENCE_DOWNLOAD_MAX_ATTEMPTS + 1):
         try:
             return download_func(*args, **kwargs)
         except Exception as e:
-            remaining_attempts = MAX_ATTEMPTS - current_attempt
+            remaining_attempts = XINFERENCE_DOWNLOAD_MAX_ATTEMPTS - current_attempt
             last_ex = e
             logger.debug(
                 "Download failed: %s, download func: %s, download args: %s, kwargs: %s",
xinference/model/video/core.py CHANGED
@@ -21,8 +21,6 @@ from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .diffusers import DiffUsersVideoModel
 
-MAX_ATTEMPTS = 3
-
 logger = logging.getLogger(__name__)
 
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
xinference/utils.py CHANGED
@@ -13,9 +13,8 @@
 # limitations under the License.
 
 
-import torch
-
-
 def cuda_count():
+    import torch
+
     # even if install torch cpu, this interface would return 0.
     return torch.cuda.device_count()
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.e51a356d.js",
+    "main.js": "./static/js/main.f7da0140.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
+    "main.f7da0140.js.map": "./static/js/main.f7da0140.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.e51a356d.js"
+    "static/js/main.f7da0140.js"
   ]
 }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.f7da0140.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>