xinference 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +10 -4
- xinference/core/model.py +2 -2
- xinference/fields.py +3 -1
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/llm_family.json +132 -3
- xinference/model/llm/llm_family_modelscope.json +139 -3
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +23 -6
- xinference/model/llm/pytorch/deepseek_vl.py +35 -9
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +38 -11
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +42 -14
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +25 -5
- xinference/model/llm/vllm/core.py +82 -3
- xinference/types.py +10 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/METADATA +3 -2
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/RECORD +33 -33
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
- /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/yi_vl.py
CHANGED
@@ -139,6 +139,12 @@ class YiVLChatModel(PytorchChatModel):
             generate_config = {}
 
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
 
         from ....thirdparty.llava.conversation import conv_templates
         from ....thirdparty.llava.mm_utils import (
@@ -166,11 +172,11 @@ class YiVLChatModel(PytorchChatModel):
         )
 
         images = state.get_images(return_pil=True)
-
-
-
-
-
+        if images:
+            image = images[0]
+            image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+                "pixel_values"
+            ][0]
 
         stop_str = state.sep
         keywords = [stop_str]
@@ -187,7 +193,9 @@ class YiVLChatModel(PytorchChatModel):
             "input_ids": input_ids,
             "images": image_tensor.unsqueeze(0)
             .to(dtype=torch.bfloat16)
-            .to(self._model.device)
+            .to(self._model.device)
+            if images
+            else None,
             "streamer": streamer,
             "do_sample": True,
             "top_p": float(top_p),
@@ -200,7 +208,7 @@ class YiVLChatModel(PytorchChatModel):
         t.start()
 
         if stream:
-            it = self._generate_stream(streamer, stop_str)
+            it = self._generate_stream(streamer, stop_str, input_ids, include_usage)
             return self._to_chat_completion_chunks(it)
         else:
             c = self._generate(streamer, stop_str)
@@ -229,8 +237,12 @@ class YiVLChatModel(PytorchChatModel):
         )
         return c
 
-    def _generate_stream(
+    def _generate_stream(
+        self, streamer, stop_str, input_ids, include_usage
+    ) -> Iterator[CompletionChunk]:
         completion_id = str(uuid.uuid1())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        prompt_tokens = len(input_ids[0])
         for i, new_text in enumerate(streamer):
             if not new_text.endswith(stop_str):
                 completion_choice = CompletionChoice(
@@ -243,10 +255,12 @@ class YiVLChatModel(PytorchChatModel):
                     model=self.model_uid,
                     choices=[completion_choice],
                 )
+                completion_tokens = i
+                total_tokens = prompt_tokens + completion_tokens
                 completion_usage = CompletionUsage(
-                    prompt_tokens
-                    completion_tokens
-                    total_tokens
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
                 )
                 chunk["usage"] = completion_usage
                 yield chunk
@@ -262,9 +276,23 @@ class YiVLChatModel(PytorchChatModel):
            choices=[completion_choice],
        )
        completion_usage = CompletionUsage(
-           prompt_tokens
-           completion_tokens
-           total_tokens
+           prompt_tokens=prompt_tokens,
+           completion_tokens=completion_tokens,
+           total_tokens=total_tokens,
        )
        chunk["usage"] = completion_usage
        yield chunk
+        if include_usage:
+            chunk = CompletionChunk(
+                id=completion_id,
+                object="text_completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[],
+            )
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
+            yield chunk
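The yi_vl.py hunks above follow the OpenAI streaming convention: `stream_options` is popped from the generate config, and when `include_usage` is set the stream ends with one extra chunk that has empty `choices` and carries only the token-usage counters. A minimal sketch of that pattern, assuming plain dict-shaped configs and chunks as in the diff (the helper names below are illustrative, not xinference APIs):

```python
# Minimal sketch of the stream_options handling added above (illustrative only;
# dict shapes follow the diff, not a documented public API).
from typing import Any, Dict, Iterator, Optional


def pop_include_usage(generate_config: Dict[str, Any]) -> bool:
    # Mirrors the diff: stream_options is popped from the config and
    # include_usage is False unless a dict explicitly provides it.
    stream_options: Optional[dict] = generate_config.pop("stream_options", None)
    return stream_options["include_usage"] if isinstance(stream_options, dict) else False


def stream_with_usage(chunks: Iterator[dict], include_usage: bool) -> Iterator[dict]:
    # Re-yield normal chunks; if include_usage is set, append one final
    # chunk with empty choices that carries only the usage counters.
    usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    for chunk in chunks:
        usage = chunk.get("usage", usage)
        yield chunk
    if include_usage:
        yield {"choices": [], "usage": usage}


if __name__ == "__main__":
    config = {"stream": True, "stream_options": {"include_usage": True}}
    flag = pop_include_usage(config)
    demo = iter(
        [{"choices": [{"text": "hi"}],
          "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}}]
    )
    for c in stream_with_usage(demo, flag):
        print(c)
```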
xinference/model/llm/sglang/core.py
CHANGED
@@ -53,6 +53,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stop: Optional[Union[str, List[str]]]
     ignore_eos: bool
     stream: bool
+    stream_options: Optional[Union[dict, None]]
 
 
 try:
@@ -157,6 +158,8 @@ class SGLANGModel(LLM):
         )
         generate_config.setdefault("stop", [])
         generate_config.setdefault("stream", False)
+        stream_options = generate_config.get("stream_options")
+        generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)
 
         return generate_config
@@ -192,7 +195,7 @@ class SGLANGModel(LLM):
 
     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str
+        request_id: str, model: str, output_text: str
     ) -> CompletionChunk:
         choices: List[CompletionChoice] = [
             CompletionChoice(
@@ -209,13 +212,6 @@ class SGLANGModel(LLM):
             model=model,
             choices=choices,
         )
-        prompt_tokens = meta_info["prompt_tokens"]
-        completion_tokens = meta_info["completion_tokens"]
-        chunk["usage"] = CompletionUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
         return chunk
 
     @staticmethod
@@ -272,6 +268,9 @@ class SGLANGModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options")
+        if isinstance(stream_options, dict):
+            include_usage = stream_options.pop("include_usage", False)
         request_id = str(uuid.uuid1())
         state = pipeline.run(
             question=prompt,
@@ -289,11 +288,34 @@ class SGLANGModel(LLM):
         else:
 
             async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
                 async for out, meta_info in state.text_async_iter(
                     var_name="answer", return_meta_data=True
                 ):
                     chunk = self._convert_state_to_completion_chunk(
-                        request_id, self.model_uid, output_text=out
+                        request_id, self.model_uid, output_text=out
+                    )
+                    prompt_tokens = meta_info["prompt_tokens"]
+                    completion_tokens = meta_info["completion_tokens"]
+                    total_tokens = prompt_tokens + completion_tokens
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
+                    yield chunk
+                if include_usage:
+                    chunk = CompletionChunk(
+                        id=request_id,
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[],
+                    )
+                    chunk["usage"] = CompletionUsage(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
                     )
                     yield chunk
 
xinference/model/llm/utils.py
CHANGED
@@ -482,9 +482,6 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
-        usage = chunk.get("usage")
-        if usage is not None:
-            chat_chunk["usage"] = usage
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
@@ -508,6 +505,19 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        return cast(ChatCompletionChunk, chat_chunk)
+
+    @classmethod
+    def _get_final_chat_completion_chunk(
+        cls, chunk: CompletionChunk
+    ) -> ChatCompletionChunk:
+        chat_chunk = {
+            "id": "chat" + chunk["id"],
+            "model": chunk["model"],
+            "created": chunk["created"],
+            "object": "chat.completion.chunk",
+            "choices": [],
+        }
         usage = chunk.get("usage")
         if usage is not None:
             chat_chunk["usage"] = usage
@@ -521,7 +531,12 @@ Begin!"""
         for i, chunk in enumerate(chunks):
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
 
     @classmethod
     async def _async_to_chat_completion_chunks(
@@ -532,7 +547,12 @@ Begin!"""
         async for chunk in chunks:
             if i == 0:
                 yield cls._get_first_chat_completion_chunk(chunk)
-
+            # usage
+            choices = chunk.get("choices")
+            if not choices:
+                yield cls._get_final_chat_completion_chunk(chunk)
+            else:
+                yield cls._to_chat_completion_chunk(chunk)
             i += 1
 
     @staticmethod
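The new `_get_final_chat_completion_chunk` classmethod gives usage-only completion chunks (those with empty `choices`) a chat-style wrapper. A standalone sketch of the same mapping, with the field layout taken from the diff (the function name below is illustrative):

```python
# Sketch of the chunk routing the hunks above add (shapes taken from the diff;
# the helper name here is illustrative, not part of xinference's public API).
import time


def to_final_chat_chunk(chunk: dict) -> dict:
    # A usage-only completion chunk becomes a final chat.completion.chunk
    # that carries nothing but the usage counters.
    chat_chunk = {
        "id": "chat" + chunk["id"],
        "model": chunk["model"],
        "created": chunk["created"],
        "object": "chat.completion.chunk",
        "choices": [],
    }
    usage = chunk.get("usage")
    if usage is not None:
        chat_chunk["usage"] = usage
    return chat_chunk


if __name__ == "__main__":
    completion_chunk = {
        "id": "abc123",
        "model": "yi-vl-chat",
        "created": int(time.time()),
        "object": "text_completion",
        "choices": [],
        "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
    }
    print(to_final_chat_chunk(completion_chunk))
```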
xinference/model/llm/vllm/core.py
CHANGED
@@ -37,6 +37,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     ToolCallFunction,
     ToolCalls,
 )
@@ -64,16 +65,19 @@ class VLLMModelConfig(TypedDict, total=False):
 
 
 class VLLMGenerateConfig(TypedDict, total=False):
+    lora_name: Optional[str]
     n: int
     best_of: Optional[int]
     presence_penalty: float
     frequency_penalty: float
     temperature: float
     top_p: float
+    top_k: int
     max_tokens: int
     stop_token_ids: Optional[List[int]]
     stop: Optional[Union[str, List[str]]]
     stream: bool  # non-sampling param, should not be passed to the engine.
+    stream_options: Optional[Union[dict, None]]
 
 
 try:
@@ -90,6 +94,7 @@ VLLM_SUPPORTED_MODELS = [
     "internlm-16k",
     "mistral-v0.1",
     "Yi",
+    "Yi-1.5",
     "code-llama",
     "code-llama-python",
 ]
@@ -106,6 +111,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "internlm2-chat",
     "qwen-chat",
     "Yi-chat",
+    "Yi-1.5-chat",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
     "mistral-instruct-v0.2",
@@ -143,16 +149,30 @@ class VLLMModel(LLM):
         quantization: str,
         model_path: str,
         model_config: Optional[VLLMModelConfig],
+        peft_model: Optional[List[LoRA]] = None,
     ):
+        try:
+            from vllm.lora.request import LoRARequest
+        except ImportError:
+            error_message = "Failed to import module 'vllm'"
+            installation_guide = [
+                "Please make sure 'vllm' is installed. ",
+                "You can install it by `pip install vllm`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._model_config = model_config
         self._engine = None
+        self.lora_modules = peft_model
+        self.lora_requests: List[LoRARequest] = []
 
     def load(self):
         try:
             import vllm
             from vllm.engine.arg_utils import AsyncEngineArgs
             from vllm.engine.async_llm_engine import AsyncLLMEngine
+            from vllm.lora.request import LoRARequest
         except ImportError:
             error_message = "Failed to import module 'vllm'"
             installation_guide = [
@@ -171,11 +191,33 @@ class VLLMModel(LLM):
         multiprocessing.set_start_method("fork", force=True)
 
         self._model_config = self._sanitize_model_config(self._model_config)
+
+        if self.lora_modules is None:
+            self.lora_requests = []
+        else:
+            self.lora_requests = [
+                LoRARequest(
+                    lora_name=lora.lora_name,
+                    lora_int_id=i,
+                    lora_local_path=lora.local_path,
+                )
+                for i, lora in enumerate(self.lora_modules, start=1)
+            ]
+
+        enable_lora = len(self.lora_requests) > 0
+        max_loras = len(self.lora_requests)
+
         logger.info(
             f"Loading {self.model_uid} with following model config: {self._model_config}"
+            f"Enable lora: {enable_lora}. Lora count: {max_loras}."
         )
 
-        engine_args = AsyncEngineArgs(
+        engine_args = AsyncEngineArgs(
+            model=self.model_path,
+            enable_lora=enable_lora,
+            max_loras=max_loras,
+            **self._model_config,
+        )
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def _sanitize_model_config(
@@ -206,6 +248,7 @@ class VLLMModel(LLM):
            generate_config = {}
 
        sanitized = VLLMGenerateConfig()
+        sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
        sanitized.setdefault("n", generate_config.get("n", 1))
        sanitized.setdefault("best_of", generate_config.get("best_of", None))
        sanitized.setdefault(
@@ -216,12 +259,16 @@ class VLLMModel(LLM):
        )
        sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
        sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
+        sanitized.setdefault("top_k", generate_config.get("top_k", -1))
        sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
        sanitized.setdefault("stop", generate_config.get("stop", None))
        sanitized.setdefault(
            "stop_token_ids", generate_config.get("stop_token_ids", None)
        )
-        sanitized.setdefault("stream", generate_config.get("stream",
+        sanitized.setdefault("stream", generate_config.get("stream", False))
+        sanitized.setdefault(
+            "stream_options", generate_config.get("stream_options", None)
+        )
 
        return sanitized
 
@@ -338,16 +385,34 @@ class VLLMModel(LLM):
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
 
+        lora_model = sanitized_generate_config.pop("lora_name")
+
+        lora_request = None
+        if lora_model is not None:
+            for lora in self.lora_requests:
+                if lora_model == lora.lora_name:
+                    lora_request = lora
+                    break
+
         stream = sanitized_generate_config.pop("stream")
+        stream_options = sanitized_generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
         sampling_params = SamplingParams(**sanitized_generate_config)
         request_id = str(uuid.uuid1())
 
         assert self._engine is not None
-        results_generator = self._engine.generate(
+        results_generator = self._engine.generate(
+            prompt, sampling_params, request_id, lora_request=lora_request
+        )
 
         async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
             previous_texts = [""] * sanitized_generate_config["n"]
             tools_token_filter = ChatModelMixin._tools_token_filter(self.model_family)
+            prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
             async for _request_output in results_generator:
                 chunk = self._convert_request_output_to_completion_chunk(
                     request_id=request_id,
@@ -398,6 +463,20 @@ class VLLMModel(LLM):
                     total_tokens=total_tokens,
                 )
                 yield chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk
 
         if stream:
             return stream_results()
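The vLLM hunks above extend the sanitized generate config with `lora_name`, `top_k`, and `stream_options`. A rough sketch of those defaults as a standalone function, mirroring the setdefault calls in the diff (the function name and example values below are illustrative):

```python
# Rough sketch of the extra vLLM generate_config keys introduced above
# (lora_name, top_k, stream_options); defaults mirror the sanitizer in the diff.
from typing import Any, Dict


def sanitize_vllm_generate_config(generate_config: Dict[str, Any]) -> Dict[str, Any]:
    sanitized: Dict[str, Any] = {}
    sanitized["lora_name"] = generate_config.get("lora_name", None)  # which loaded LoRA adapter to route to
    sanitized["temperature"] = generate_config.get("temperature", 1.0)
    sanitized["top_p"] = generate_config.get("top_p", 1.0)
    sanitized["top_k"] = generate_config.get("top_k", -1)  # -1 disables top-k filtering in vLLM
    sanitized["max_tokens"] = generate_config.get("max_tokens", 1024)
    sanitized["stream"] = generate_config.get("stream", False)
    sanitized["stream_options"] = generate_config.get("stream_options", None)
    return sanitized


if __name__ == "__main__":
    print(
        sanitize_vllm_generate_config(
            {
                "lora_name": "my-adapter",  # must match a LoRA registered at load time
                "top_k": 40,
                "stream": True,
                "stream_options": {"include_usage": True},
            }
        )
    )
```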
xinference/types.py
CHANGED
@@ -187,6 +187,8 @@ class ChatglmCppGenerateConfig(TypedDict, total=False):
     top_p: float
     temperature: float
     stream: bool
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class QWenCppModelConfig(TypedDict, total=False):
@@ -231,6 +233,7 @@ class LlamaCppGenerateConfig(TypedDict, total=False):
     repetition_penalty: float
     top_k: int
     stream: bool
+    stream_options: Optional[Union[dict, None]]
     tfs_z: float
     mirostat_mode: int
     mirostat_tau: float
@@ -279,6 +282,8 @@ class PytorchGenerateConfig(TypedDict, total=False):
     stream_interval: int
     model: Optional[str]
     tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+    stream_options: Optional[Union[dict, None]]
 
 
 class PytorchModelConfig(TypedDict, total=False):
@@ -350,10 +355,12 @@ class CreateCompletionTorch(BaseModel):
     stop: Optional[Union[str, List[str]]] = stop_field
     stop_token_ids: Optional[Union[int, List[int]]] = none_field
     stream: bool = stream_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     stream_interval: int = stream_interval_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     top_k: int = top_k_field
+    lora_name: Optional[str]
 
 
 CreateCompletionLlamaCpp: BaseModel
@@ -366,6 +373,8 @@ try:
         include_fields={
             "grammar": (Optional[Any], None),
             "max_tokens": (Optional[int], max_tokens_field),
+            "lora_name": (Optional[str], None),
+            "stream_options": (Optional[Union[dict, None]], None),
         },
     )
 except ImportError:
@@ -393,7 +402,7 @@ class _CreateCompletionOpenAIFallback(BaseModel):
     seed: Optional[int] = none_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[dict] = stream_option_field
+    stream_options: Optional[Union[dict, None]] = stream_option_field
     suffix: Optional[str] = none_field
     temperature: float = temperature_field
     top_p: float = top_p_field
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.54bca460.css",
-    "main.js": "./static/js/main.8e44da4b.js",
+    "main.js": "./static/js/main.551aa479.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.54bca460.css.map": "./static/css/main.54bca460.css.map",
-    "main.8e44da4b.js.map": "./static/js/main.8e44da4b.js.map"
+    "main.551aa479.js.map": "./static/js/main.551aa479.js.map"
   },
   "entrypoints": [
     "static/css/main.54bca460.css",
-    "static/js/main.8e44da4b.js"
+    "static/js/main.551aa479.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.8e44da4b.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.551aa479.js"></script><link href="./static/css/main.54bca460.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>