xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +1 -1
- xinference/core/model.py +17 -6
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +68 -101
- xinference/deploy/cmdline.py +166 -1
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +7 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +438 -7
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +258 -5
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +115 -33
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +94 -12
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +96 -51
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +54 -20
- xinference/model/llm/vllm/core.py +101 -7
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +11 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.551aa479.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/vllm/core.py CHANGED

@@ -37,6 +37,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     ToolCallFunction,
     ToolCalls,
 )
@@ -64,16 +65,19 @@ class VLLMModelConfig(TypedDict, total=False):
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
+    lora_name: Optional[str]
     n: int
     best_of: Optional[int]
     presence_penalty: float
     frequency_penalty: float
     temperature: float
     top_p: float
+    top_k: int
     max_tokens: int
     stop_token_ids: Optional[List[int]]
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
+    stream_options: Optional[Union[dict, None]]
 
 
 try:
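
The three new keys above surface directly in the per-request generate config. A minimal sketch of a config dict exercising them; the values are illustrative choices, not defaults taken from this diff:

```python
# Illustrative VLLMGenerateConfig-style dict using the keys added in 0.11.1:
# lora_name, top_k and stream_options.
generate_config = {
    "lora_name": "my-adapter",                  # select a LoRA adapter registered at load time
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 40,                                # new: top-k sampling (-1 keeps it disabled)
    "max_tokens": 512,
    "stream": True,
    "stream_options": {"include_usage": True},  # new: ask for a trailing usage-only chunk
}
```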
@@ -90,6 +94,7 @@ VLLM_SUPPORTED_MODELS = [
     "internlm-16k",
     "mistral-v0.1",
     "Yi",
+    "Yi-1.5",
     "code-llama",
     "code-llama-python",
 ]
@@ -106,10 +111,12 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm2-chat",
     "qwen-chat",
     "Yi-chat",
+    "Yi-1.5-chat",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
     "mixtral-instruct-v0.1",
+    "mixtral-8x22B-instruct-v0.1",
     "chatglm3",
     "chatglm3-32k",
     "chatglm3-128k",
@@ -142,16 +149,30 @@ class VLLMModel(LLM):
         quantization: str,
         model_path: str,
         model_config: Optional[VLLMModelConfig],
+        peft_model: Optional[List[LoRA]] = None,
     ):
+        try:
+            from vllm.lora.request import LoRARequest
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self.lora_modules = peft_model
+        self.lora_requests: List[LoRARequest] = []
 
     def load(self):
         try:
             import vllm
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
+            from vllm.lora.request import LoRARequest
         except ImportError:
             error_message = "Failed to import module 'vllm'"
             installation_guide = [
@@ -170,11 +191,33 @@ class VLLMModel(LLM):
             multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+
+        if self.lora_modules is None:
+            self.lora_requests = []
+        else:
+            self.lora_requests = [
+                LoRARequest(
+                    lora_name=lora.lora_name,
+                    lora_int_id=i,
+                    lora_local_path=lora.local_path,
+                )
+                for i, lora in enumerate(self.lora_modules, start=1)
+            ]
+
+        enable_lora = len(self.lora_requests) > 0
+        max_loras = len(self.lora_requests)
+
         logger.info(
             f"Loading {self.model_uid} with following model config: {self._model_config}"
+            f"Enable lora: {enable_lora}. Lora count: {max_loras}."
         )
 
-        engine_args = AsyncEngineArgs(model=self.model_path, **self._model_config)
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            enable_lora=enable_lora,
+            max_loras=max_loras,
+            **self._model_config,
+        )
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def _sanitize_model_config(
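
For context, the `peft_model` argument threaded through `__init__` and `load()` is a list of `LoRA` entries, and each entry's `lora_name` is what a request later references via `generate_config["lora_name"]`. A hedged sketch of such a list; the names and paths are placeholders, and it assumes `LoRA` accepts these fields as keywords, as the attribute access in the hunk above suggests:

```python
from xinference.types import LoRA

# Hypothetical adapters; local_path values are placeholders.
peft_model = [
    LoRA(lora_name="sql-adapter", local_path="/path/to/sql_lora"),
    LoRA(lora_name="chat-adapter", local_path="/path/to/chat_lora"),
]
# During load(), these become vllm LoRARequest objects with lora_int_id 1 and 2;
# a request that sets generate_config["lora_name"] = "sql-adapter" is routed to
# the matching LoRARequest.
```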
@@ -205,6 +248,7 @@ class VLLMModel(LLM):
             generate_config = {}
 
         sanitized = VLLMGenerateConfig()
+        sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
         sanitized.setdefault("best_of", generate_config.get("best_of", None))
         sanitized.setdefault(
@@ -215,12 +259,16 @@
         )
         sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
         sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
+        sanitized.setdefault("top_k", generate_config.get("top_k", -1))
         sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
         sanitized.setdefault("stop", generate_config.get("stop", None))
         sanitized.setdefault(
             "stop_token_ids", generate_config.get("stop_token_ids", None)
         )
-        sanitized.setdefault("stream", generate_config.get("stream",
+        sanitized.setdefault("stream", generate_config.get("stream", False))
+        sanitized.setdefault(
+            "stream_options", generate_config.get("stream_options", None)
+        )
 
         return sanitized
 
@@ -239,10 +287,17 @@ class VLLMModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format
-            # Currently, only 4-bit weight quantization is supported for
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
                 return False
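
The same quantization gate appears again in `VLLMChatModel.match` further below. Restated as a standalone predicate, purely as a rough illustration of the rule rather than the actual `match()` signature:

```python
# Rough restatement of the quantization gate added above; not the real method.
def vllm_quant_ok(model_format: str, quantization: str, vllm_version: str) -> bool:
    if model_format == "awq":
        return "4" in quantization                   # AWQ: 4-bit only
    if model_format == "gptq":
        if vllm_version >= "0.3.3":                  # plain string comparison, as in the diff
            return any(q in quantization for q in ("3", "4", "8"))
        return "4" in quantization                   # older vLLM: 4-bit GPTQ only
    return True                                      # other formats are handled elsewhere
```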
@@ -330,16 +385,34 @@ class VLLMModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
 
+        lora_model = sanitized_generate_config.pop("lora_name")
+
+        lora_request = None
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         sampling_params = SamplingParams(**sanitized_generate_config)
         request_id = str(uuid.uuid1())
 
         assert self._engine is not None
-        results_generator = self._engine.generate(prompt, sampling_params, request_id)
+        results_generator = self._engine.generate(
+            prompt, sampling_params, request_id, lora_request=lora_request
+        )
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
             tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
+            prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
@@ -390,6 +463,20 @@ class VLLMModel(LLM):
                     total_tokens=total_tokens,
                 )
                 yield chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk
 
         if stream:
             return stream_results()
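
With `stream=True` and `stream_options={"include_usage": True}`, the generator now ends with one extra chunk whose `choices` list is empty and whose `usage` carries the final token counts. A minimal consumer sketch, assuming only the chunk shape shown above; the function name is illustrative:

```python
# Sketch: reading the async chunk stream returned when stream=True.
async def read_stream(chunk_stream):
    async for chunk in chunk_stream:
        if not chunk["choices"]:
            # trailing usage-only chunk emitted when include_usage is set
            usage = chunk["usage"]
            print(f"\n[usage] prompt={usage['prompt_tokens']} "
                  f"completion={usage['completion_tokens']} "
                  f"total={usage['total_tokens']}")
        else:
            print(chunk["choices"][0].get("text", ""), end="")
```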
@@ -416,10 +503,17 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if llm_spec.model_format
-            # Currently, only 4-bit weight quantization is supported for
+        if llm_spec.model_format == "awq":
+            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
             if "4" not in quantization:
                 return False
+        if llm_spec.model_format == "gptq":
+            if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":
+                if not any(q in quantization for q in ("3", "4", "8")):
+                    return False
+            else:
+                if "4" not in quantization:
+                    return False
         if isinstance(llm_family, CustomLLMFamilyV1):
             if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
                 return False

xinference/thirdparty/omnilmm/chat.py CHANGED

@@ -4,7 +4,6 @@ import json
 import os
 
 import torch
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
 
@@ -20,6 +19,8 @@ DEFAULT_IM_END_TOKEN = "<im_end>"
 
 
 def init_omni_lmm(model_path, device_map):
+    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+
     torch.backends.cuda.matmul.allow_tf32 = True
     disable_torch_init()
     model_name = os.path.expanduser(model_path)

xinference/thirdparty/omnilmm/model/omnilmm.py CHANGED

@@ -2,7 +2,6 @@ import gc
 import math
 from typing import List, Optional, Tuple, Union
 
-import timm
 import torch
 import torch.nn as nn
 from torch import Tensor
@@ -37,6 +36,8 @@ class Identity(torch.nn.Identity):
 
 
 def create_vision_module(config):
+    import timm
+
     vision_tower = timm.create_model(
         "eva02_enormous_patch14_clip_224.laion2b_plus",
         pretrained=False,
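
Both omnilmm hunks apply the same change: `accelerate` and `timm` move from module-level imports into the functions that actually use them, so importing the module no longer hard-requires those optional dependencies. The pattern in isolation, as a generic sketch rather than the xinference code itself:

```python
# Deferred-import pattern: the heavy optional dependency is only imported when
# the function runs, so "import this_module" succeeds without timm installed.
def create_vision_module(model_name: str):
    import timm  # raises ImportError only if/when a vision module is actually built

    return timm.create_model(model_name, pretrained=False)
```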
xinference/types.py CHANGED

@@ -33,6 +33,7 @@ from .fields import (
     stop_field,
     stream_field,
     stream_interval_field,
+    stream_option_field,
     temperature_field,
     top_k_field,
     top_p_field,
@@ -186,6 +187,8 @@ class ChatglmCppGenerateConfig(TypedDict, total=False):
     top_p: float
     temperature: float
     stream: bool
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class QWenCppModelConfig(TypedDict, total=False):
@@ -230,6 +233,7 @@ class LlamaCppGenerateConfig(TypedDict, total=False):
     repetition_penalty: float
     top_k: int
     stream: bool
+    stream_options: Optional[Union[dict, None]]
     tfs_z: float
     mirostat_mode: int
     mirostat_tau: float
@@ -278,6 +282,8 @@ class PytorchGenerateConfig(TypedDict, total=False):
     stream_interval: int
     model: Optional[str]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class PytorchModelConfig(TypedDict, total=False):
@@ -349,10 +355,12 @@ class CreateCompletionTorch(BaseModel):
     stop: Optional[Union[str, List[str]]] = stop_field
     stop_token_ids: Optional[Union[int, List[int]]] = none_field
     stream: bool = stream_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     stream_interval: int = stream_interval_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     top_k: int = top_k_field
+    lora_name: Optional[str]
 
 
 CreateCompletionLlamaCpp: BaseModel
@@ -365,6 +373,8 @@ try:
         include_fields={
             "grammar": (Optional[Any], None),
             "max_tokens": (Optional[int], max_tokens_field),
+            "lora_name": (Optional[str], None),
+            "stream_options": (Optional[Union[dict, None]], None),
         },
     )
 except ImportError:
@@ -392,6 +402,7 @@ class _CreateCompletionOpenAIFallback(BaseModel):
     seed: Optional[int] = none_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     suffix: Optional[str] = none_field
     temperature: float = temperature_field
     top_p: float = top_p_field
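
These schema additions mean the completion request models now accept `stream_options` (and, for the Torch and llama.cpp schemas, `lora_name`) in the request body. An illustrative payload; the model uid, adapter name, and endpoint wiring are assumptions, not taken from this diff:

```python
# Hypothetical request body for an OpenAI-style completion call against a
# running xinference server; field names mirror the pydantic models above.
payload = {
    "model": "my-model-uid",
    "prompt": "Write a haiku about GPUs.",
    "stream": True,
    "stream_options": {"include_usage": True},
    "lora_name": "sql-adapter",
}
```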

xinference/web/ui/build/asset-manifest.json CHANGED

@@ -1,11 +1,14 @@
 {
   "files": {
-    "main.js": "./static/js/main.26fdbfbe.js",
+    "main.css": "./static/css/main.54bca460.css",
+    "main.js": "./static/js/main.551aa479.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.26fdbfbe.js.map": "./static/js/main.26fdbfbe.js.map"
+    "main.54bca460.css.map": "./static/css/main.54bca460.css.map",
+    "main.551aa479.js.map": "./static/js/main.551aa479.js.map"
   },
   "entrypoints": [
-    "static/js/main.26fdbfbe.js"
+    "static/css/main.54bca460.css",
+    "static/js/main.551aa479.js"
   ]
 }

xinference/web/ui/build/index.html CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.26fdbfbe.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.551aa479.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

xinference/web/ui/build/static/css/main.54bca460.css ADDED

@@ -0,0 +1,2 @@
+.formBox{max-height:80vh;max-width:50vw;min-width:50vw;overflow:auto;padding:40px 20px 0 0;position:relative;transition:all .4s ease-in-out}.broaden{max-width:100%;min-width:100%;padding-right:0}.show-json{align-items:center;color:#444;display:flex;position:fixed;right:60px;top:90px}.icon{cursor:pointer;margin-left:20px;position:absolute;right:-40px}.icon:hover{color:#1976d2}.arrow{font-size:24px!important}.jsonBox{min-height:80vh;position:relative;transition:all .4s ease-in-out;width:100%}.hide{overflow:hidden;-webkit-transform:translate(30vw);transform:translate(30vw);width:0}.jsonBox-header{font-weight:700;line-height:40px}.textarea{border:1px solid #ddd;border-radius:5px;color:#444;height:calc(100% - 40px);padding:5px 10px;resize:none;width:100%}.copyIcon{color:#555;cursor:pointer;font-size:16px!important;position:absolute;right:5px;top:13px}.copyIcon:hover{color:#1976d2}.addBtn{margin-left:20px!important}.item{background-color:#eee;border-radius:10px;margin:10px 50px 0;overflow:hidden;padding:20px;position:relative}.item:hover .deleteBtn{-webkit-transform:translateX(-50px);transform:translateX(-50px)}.deleteBtn{background-color:#1976d2;border-radius:25px;height:50px;line-height:70px;position:absolute;right:20px;text-align:center;top:calc(50% - 25px);-webkit-transform:translateX(80px);transform:translateX(80px);transition:all .3s ease-in-out;width:50px}.deleteBtn:hover{box-shadow:0 0 10px #aaa;cursor:pointer}.deleteIcon{color:#fff;font-size:28px!important}
+/*# sourceMappingURL=main.54bca460.css.map*/

xinference/web/ui/build/static/css/main.54bca460.css.map ADDED

@@ -0,0 +1 @@
+{"version":3,"file":"static/css/main.54bca460.css","mappings":"AAAA,SAIE,eAAgB,CAFhB,cAAe,CACf,cAAe,CAEf,aAAc,CACd,qBAAsB,CALtB,iBAAkB,CAMlB,8BACF,CAEA,SACE,cAAe,CACf,cAAe,CACf,eACF,CAEA,WAEE,kBAAmB,CAInB,UAAW,CALX,YAAa,CAEb,cAAe,CAEf,UAAW,CADX,QAGF,CAEA,MAGE,cAAe,CACf,gBAAiB,CAHjB,iBAAkB,CAClB,WAGF,CAEA,YACE,aACF,CAEA,OACE,wBACF,CAEA,SAEE,eAAgB,CADhB,iBAAkB,CAGlB,8BAAgC,CADhC,UAEF,CAEA,MAGE,eAAgB,CADhB,iCAA6B,CAA7B,yBAA6B,CAD7B,OAGF,CAEA,gBAEE,eAAgB,CADhB,gBAEF,CAEA,UAIE,qBAAsB,CACtB,iBAAkB,CAElB,UAAW,CALX,wBAAyB,CACzB,gBAAiB,CAGjB,WAAY,CALZ,UAOF,CAEA,UAME,UAAW,CALX,cAAe,CAIf,wBAA0B,CAH1B,iBAAkB,CAElB,SAAU,CADV,QAIF,CAEA,gBACE,aACF,CAEA,QACE,0BACF,CAEA,MAEE,qBAAsB,CAGtB,kBAAmB,CAFnB,kBAAmB,CAGnB,eAAgB,CAFhB,YAAa,CAHb,iBAMF,CAEA,uBACE,mCAA4B,CAA5B,2BACF,CAEA,WAUE,wBAAyB,CADzB,kBAAmB,CAJnB,WAAY,CAGZ,gBAAiB,CAPjB,iBAAkB,CAClB,UAAW,CAKX,iBAAkB,CAJlB,oBAAqB,CAGrB,kCAA2B,CAA3B,0BAA2B,CAK3B,8BAAgC,CAPhC,UAQF,CAEA,iBAEE,wBAAyB,CADzB,cAEF,CAEA,YAEE,UAAW,CADX,wBAEF","sources":["scenes/register_model/styles/registerModelStyle.css"],"sourcesContent":[".formBox {\n  position: relative;\n  max-width: 50vw;\n  min-width: 50vw;\n  max-height: 80vh;\n  overflow: auto;\n  padding: 40px 20px 0 0;\n  transition: all 0.4s ease-in-out;\n}\n\n.broaden {\n  max-width: 100%;\n  min-width: 100%;\n  padding-right: 0;\n}\n\n.show-json {\n  display: flex;\n  align-items: center;\n  position: fixed;\n  top: 90px;\n  right: 60px;\n  color: #444;\n}\n\n.icon {\n  position: absolute;\n  right: -40px;\n  cursor: pointer;\n  margin-left: 20px;\n}\n\n.icon:hover {\n  color: #1976d2;\n}\n\n.arrow {\n  font-size: 24px !important;\n}\n\n.jsonBox {\n  position: relative;\n  min-height: 80vh;\n  width: 100%;\n  transition: all 0.4s ease-in-out;\n}\n\n.hide {\n  width: 0;\n  transform: translate(30vw, 0);\n  overflow: hidden;\n}\n\n.jsonBox-header {\n  line-height: 40px;\n  font-weight: 700;\n}\n\n.textarea {\n  width: 100%;\n  height: calc(100% - 40px);\n  padding: 5px 10px;\n  border: 1px solid #ddd;\n  border-radius: 5px;\n  resize: none;\n  color: #444;\n}\n\n.copyIcon {\n  cursor: pointer;\n  position: absolute;\n  top: 13px;\n  right: 5px;\n  font-size: 16px !important;\n  color: #555;\n}\n\n.copyIcon:hover {\n  color: #1976d2;\n}\n\n.addBtn {\n  margin-left: 20px !important;\n}\n\n.item {\n  position: relative;\n  background-color: #eee;\n  margin: 10px 50px 0;\n  padding: 20px;\n  border-radius: 10px;\n  overflow: hidden;\n}\n\n.item:hover .deleteBtn {\n  transform: translateX(-50px);\n}\n\n.deleteBtn {\n  position: absolute;\n  right: 20px;\n  top: calc(50% - 25px);\n  width: 50px;\n  height: 50px;\n  transform: translateX(80px);\n  text-align: center;\n  line-height: 70px;\n  border-radius: 25px;\n  background-color: #1976d2;\n  transition: all 0.3s ease-in-out;\n}\n\n.deleteBtn:hover {\n  cursor: pointer;\n  box-shadow: 0 0 10px #aaa;\n}\n\n.deleteIcon {\n  font-size: 28px !important;\n  color: #fff;\n}\n"],"names":[],"sourceRoot":""}