xinference 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +50 -1
- xinference/client/restful/restful_client.py +82 -2
- xinference/constants.py +3 -0
- xinference/core/chat_interface.py +297 -83
- xinference/core/model.py +1 -0
- xinference/core/progress_tracker.py +16 -8
- xinference/core/supervisor.py +45 -1
- xinference/core/worker.py +262 -37
- xinference/deploy/cmdline.py +33 -1
- xinference/model/audio/core.py +11 -1
- xinference/model/audio/megatts.py +105 -0
- xinference/model/audio/model_spec.json +24 -1
- xinference/model/audio/model_spec_modelscope.json +26 -1
- xinference/model/core.py +14 -0
- xinference/model/embedding/core.py +6 -1
- xinference/model/flexible/core.py +6 -1
- xinference/model/image/core.py +6 -1
- xinference/model/image/model_spec.json +17 -1
- xinference/model/image/model_spec_modelscope.json +17 -1
- xinference/model/llm/__init__.py +0 -4
- xinference/model/llm/core.py +4 -0
- xinference/model/llm/llama_cpp/core.py +40 -16
- xinference/model/llm/llm_family.json +413 -84
- xinference/model/llm/llm_family.py +24 -1
- xinference/model/llm/llm_family_modelscope.json +447 -0
- xinference/model/llm/mlx/core.py +16 -2
- xinference/model/llm/transformers/__init__.py +14 -0
- xinference/model/llm/transformers/core.py +30 -6
- xinference/model/llm/transformers/gemma3.py +17 -2
- xinference/model/llm/transformers/intern_vl.py +28 -18
- xinference/model/llm/transformers/minicpmv26.py +21 -2
- xinference/model/llm/transformers/qwen-omni.py +308 -0
- xinference/model/llm/transformers/qwen2_audio.py +1 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -4
- xinference/model/llm/utils.py +11 -1
- xinference/model/llm/vllm/core.py +35 -0
- xinference/model/llm/vllm/distributed_executor.py +8 -2
- xinference/model/rerank/core.py +6 -1
- xinference/model/utils.py +118 -1
- xinference/model/video/core.py +6 -1
- xinference/thirdparty/megatts3/__init__.py +0 -0
- xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
- xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
- xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
- xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
- xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
- xinference/types.py +10 -0
- xinference/utils.py +54 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
- xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
- xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
- xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -1
- xinference/web/ui/src/locales/zh.json +2 -1
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/METADATA +127 -114
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/RECORD +96 -60
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
- xinference/web/ui/build/static/js/main.5ca4eea1.js +0 -3
- xinference/web/ui/build/static/js/main.5ca4eea1.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f0967acaec5df1d45b80010949c258d64297ebbb0f44b8bb3afcbd45c6f0ec4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68249645124f37d01eef83b1d897e751f895bea919b6fb466f907c1f87cebc84.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
- /xinference/web/ui/build/static/js/{main.5ca4eea1.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
- {xinference-1.4.1.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/core/chat_interface.py
CHANGED

@@ -16,11 +16,13 @@ import base64
 import html
 import logging
 import os
+import tempfile
 from io import BytesIO
-from typing import
+from typing import Generator, List, Optional

 import gradio as gr
 import PIL.Image
+from gradio import ChatMessage
 from gradio.components import Markdown, Textbox
 from gradio.layouts import Accordion, Column, Row

@@ -65,13 +67,13 @@ class GradioInterface:

     def build(self) -> "gr.Blocks":
         if "vision" in self.model_ability:
-            interface = self.
+            interface = self.build_chat_multimodel_interface()
         elif "chat" in self.model_ability:
             interface = self.build_chat_interface()
         else:
             interface = self.build_generate_interface()

-        interface.queue()
+        interface.queue(default_concurrency_limit=os.cpu_count())
         # Gradio initiates the queue during a startup event, but since the app has already been
         # started, that event will not run, so manually invoke the startup events.
         # See: https://github.com/gradio-app/gradio/issues/5228
@@ -91,25 +93,10 @@ class GradioInterface:
         interface.favicon_path = favicon_path
         return interface

-    def build_chat_interface(
-        self,
-    ) -> "gr.Blocks":
-        def flatten(matrix: List[List[str]]) -> List[str]:
-            flat_list = []
-            for row in matrix:
-                flat_list += row
-            return flat_list
-
-        def to_chat(lst: List[str]) -> List[Dict]:
-            res = []
-            for i in range(len(lst)):
-                role = "assistant" if i % 2 == 1 else "user"
-                res.append(dict(role=role, content=lst[i]))
-            return res
-
+    def build_chat_interface(self) -> "gr.Blocks":
         def generate_wrapper(
             message: str,
-            history: List[
+            history: List[ChatMessage],
             max_tokens: int,
             temperature: float,
             lora_name: str,
@@ -121,13 +108,22 @@ class GradioInterface:
             client._set_token(self._access_token)
             model = client.get_model(self.model_uid)
             assert isinstance(model, RESTfulChatModelHandle)
-
-            messages
+
+            # Convert history to messages format
+            messages = []
+            for msg in history:
+                # ignore thinking content
+                if msg["metadata"]:
+                    continue
+                messages.append({"role": msg["role"], "content": msg["content"]})

             if stream:
                 response_content = ""
+                reasoning_content = ""
+                is_first_reasoning_content = True
+                is_first_content = True
                 for chunk in model.chat(
-                    messages,
+                    messages=messages,
                     generate_config={
                         "max_tokens": int(max_tokens),
                         "temperature": temperature,
@@ -137,46 +133,79 @@ class GradioInterface:
                 ):
                     assert isinstance(chunk, dict)
                     delta = chunk["choices"][0]["delta"]
-                    if "content" not in delta or delta["content"] is None:
-                        continue
-                    else:
-                        # some model like deepseek-r1-distill-qwen
-                        # will generate <think>...</think> ...
-                        # in gradio, no output will be rendered,
-                        # thus escape html tags in advance
-                        response_content += html.escape(delta["content"])
-                        yield response_content

-
+                    if (
+                        "reasoning_content" in delta
+                        and delta["reasoning_content"] is not None
+                        and is_first_reasoning_content
+                    ):
+                        reasoning_content += html.escape(delta["reasoning_content"])
+                        history.append(
+                            ChatMessage(
+                                role="assistant",
+                                content=reasoning_content,
+                                metadata={"title": "💭 Thinking Process"},
+                            )
+                        )
+                        is_first_reasoning_content = False
+                    elif (
+                        "reasoning_content" in delta
+                        and delta["reasoning_content"] is not None
+                    ):
+                        reasoning_content += html.escape(delta["reasoning_content"])
+                        history[-1] = ChatMessage(
+                            role="assistant",
+                            content=reasoning_content,
+                            metadata={"title": "💭 Thinking Process"},
+                        )
+                    elif (
+                        "content" in delta
+                        and delta["content"] is not None
+                        and is_first_content
+                    ):
+                        response_content += html.escape(delta["content"])
+                        history.append(
+                            ChatMessage(role="assistant", content=response_content)
+                        )
+                        is_first_content = False
+                    elif "content" in delta and delta["content"] is not None:
+                        response_content += html.escape(delta["content"])
+                        # Replace thinking message with actual response
+                        history[-1] = ChatMessage(
+                            role="assistant", content=response_content
+                        )
+                    yield history
             else:
                 result = model.chat(
-                    messages,
+                    messages=messages,
                     generate_config={
                         "max_tokens": int(max_tokens),
                         "temperature": temperature,
                         "lora_name": lora_name,
                     },
                 )
-
+                assert isinstance(result, dict)
+                mg = result["choices"][0]["message"]
+                if "reasoning_content" in mg:
+                    reasoning_content = mg["reasoning_content"]
+                    if reasoning_content is not None:
+                        reasoning_content = html.escape(str(reasoning_content))
+                        history.append(
+                            ChatMessage(
+                                role="assistant",
+                                content=reasoning_content,
+                                metadata={"title": "💭 Thinking Process"},
+                            )
+                        )

-
-
-
-
-
-
-
-
-                        else self.context_length // 2,
-                        step=1,
-                        label="Max Tokens",
-                    ),
-                    gr.Slider(
-                        minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
-                    ),
-                    gr.Text(label="LoRA Name"),
-                    gr.Checkbox(label="Stream", value=True),
-                ],
+                content = mg["content"]
+                response_content = (
+                    html.escape(str(content)) if content is not None else ""
+                )
+                history.append(ChatMessage(role="assistant", content=response_content))
+                yield history
+
+        with gr.Blocks(
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
             .center{
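The hunk above switches the chat tab to Gradio `ChatMessage` objects and surfaces the new `reasoning_content` field that reasoning models (e.g. the deepseek-r1-distill family mentioned in the removed comment) emit alongside `content`. The same field is visible to API consumers; the following is a minimal sketch, not part of the diff, of reading it through the RESTful client, assuming a reasoning-capable model is already launched and using a placeholder endpoint and model UID:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # placeholder endpoint
model = client.get_model("my-reasoning-model")   # placeholder model UID

thinking, answer = "", ""
for chunk in model.chat(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    generate_config={"stream": True, "max_tokens": 512},
):
    delta = chunk["choices"][0]["delta"]
    # Reasoning models stream a separate "reasoning_content" next to "content".
    if delta.get("reasoning_content"):
        thinking += delta["reasoning_content"]
    elif delta.get("content"):
        answer += delta["content"]

print("thinking:", thinking)
print("answer:", answer)

Whether `reasoning_content` appears at all depends on the model and on reasoning-content parsing being enabled on the server side.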
@@ -186,25 +215,123 @@ class GradioInterface:
                 padding: 0px;
                 color: #9ea4b0 !important;
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            .main-container {
+                display: flex;
+                flex-direction: column;
+                padding: 0.5rem;
+                box-sizing: border-box;
+                gap: 0.25rem;
+                flex-grow: 1;
+                min-width: min(320px, 100%);
+                height: calc(100vh - 70px)!important;
+            }
+            .header {
+                flex-grow: 0!important;
+            }
+            .header h1 {
+                margin: 0.5rem 0;
+                font-size: 1.5rem;
+            }
+            .center {
+                font-size: 0.9rem;
+                margin: 0.1rem 0;
+            }
+            .chat-container {
+                flex: 1;
+                display: flex;
+                min-height: 0;
+                margin: 0.25rem 0;
+            }
+            .chat-container .block {
+                height: 100%!important;
+            }
+            .input-container {
+                flex-grow: 0!important;
+            }
             """,
             analytics_enabled=False,
-        )
+        ) as chat_interface:
+            with gr.Column(elem_classes="main-container"):
+                # Header section
+                with gr.Column(elem_classes="header"):
+                    gr.Markdown(
+                        f"""<h1 style='text-align: center; margin-bottom: 1rem'>🚀 Xinference Chat Bot : {self.model_name} 🚀</h1>"""
+                    )
+                    gr.Markdown(
+                        f"""
+                        <div class="center">Model ID: {self.model_uid}</div>
+                        <div class="center">Model Size: {self.model_size_in_billions} Billion Parameters</div>
+                        <div class="center">Model Format: {self.model_format}</div>
+                        <div class="center">Model Quantization: {self.quantization}</div>
+                        """
+                    )
+
+                # Chat container
+                with gr.Column(elem_classes="chat-container"):
+                    chatbot = gr.Chatbot(
+                        type="messages",
+                        label=self.model_name,
+                        show_label=True,
+                        render_markdown=True,
+                        container=True,
+                    )

-
+                # Input container
+                with gr.Column(elem_classes="input-container"):
+                    with gr.Row():
+                        with gr.Column(scale=12):
+                            textbox = gr.Textbox(
+                                show_label=False,
+                                placeholder="Type a message...",
+                                container=False,
+                            )
+                        with gr.Column(scale=1, min_width=50):
+                            submit_btn = gr.Button("Enter", variant="primary")
+
+                    with gr.Accordion("Additional Inputs", open=False):
+                        max_tokens = gr.Slider(
+                            minimum=1,
+                            maximum=self.context_length,
+                            value=512
+                            if "reasoning" not in self.model_ability
+                            else self.context_length // 2,
+                            step=1,
+                            label="Max Tokens",
+                        )
+                        temperature = gr.Slider(
+                            minimum=0,
+                            maximum=2,
+                            value=1,
+                            step=0.01,
+                            label="Temperature",
+                        )
+                        stream = gr.Checkbox(label="Stream", value=True)
+                        lora_name = gr.Text(label="LoRA Name")
+
+            # deal with message submit
+            textbox.submit(
+                lambda m, h: ("", h + [ChatMessage(role="user", content=m)]),
+                [textbox, chatbot],
+                [textbox, chatbot],
+            ).then(
+                generate_wrapper,
+                [textbox, chatbot, max_tokens, temperature, lora_name, stream],
+                chatbot,
+            )
+
+            submit_btn.click(
+                lambda m, h: ("", h + [ChatMessage(role="user", content=m)]),
+                [textbox, chatbot],
+                [textbox, chatbot],
+            ).then(
+                generate_wrapper,
+                [textbox, chatbot, max_tokens, temperature, lora_name, stream],
+                chatbot,
+            )
+
+        return chat_interface
+
+    def build_chat_multimodel_interface(
         self,
     ) -> "gr.Blocks":
         def predict(history, bot, max_tokens, temperature, stream):
@@ -251,11 +378,46 @@ class GradioInterface:
                 },
             )
             history.append(response["choices"][0]["message"])
-
-
+            if "audio" in history[-1]:
+                # audio output
+                audio_bytes = base64.b64decode(history[-1]["audio"]["data"])
+                audio_file = tempfile.NamedTemporaryFile(
+                    delete=False, suffix=".wav"
+                )
+                audio_file.write(audio_bytes)
+                audio_file.close()
+
+                def audio_to_base64(audio_path):
+                    with open(audio_path, "rb") as audio_file:
+                        return base64.b64encode(audio_file.read()).decode("utf-8")
+
+                def generate_html_audio(audio_path):
+                    base64_audio = audio_to_base64(audio_path)
+                    audio_format = audio_path.split(".")[-1]
+                    return (
+                        f"<audio controls style='max-width:100%;'>"
+                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
+                        f"Your browser does not support the audio tag.</audio>"
+                    )

-
-
+                bot[-1] = (bot[-1][0], history[-1]["content"])
+                yield history, bot
+
+                # append html audio tag instead of gr.Audio
+                bot.append((None, generate_html_audio(audio_file.name)))
+                yield history, bot
+            else:
+                bot[-1][1] = history[-1]["content"]
+                yield history, bot
+
+        def add_text(history, bot, text, image, video, audio):
+            logger.debug(
+                "Add text, text: %s, image: %s, video: %s, audio: %s",
+                text,
+                image,
+                video,
+                audio,
+            )
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:
@@ -306,20 +468,54 @@ class GradioInterface:
                         },
                     ],
                 }
+
+            elif audio:
+
+                def audio_to_base64(audio_path):
+                    with open(audio_path, "rb") as audio_file:
+                        encoded_string = base64.b64encode(audio_file.read()).decode(
+                            "utf-8"
+                        )
+                        return encoded_string
+
+                def generate_html_audio(audio_path):
+                    base64_audio = audio_to_base64(audio_path)
+                    audio_format = audio_path.split(".")[-1]
+                    return (
+                        f"<audio controls style='max-width:100%;'>"
+                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
+                        f"Your browser does not support the audio tag.</audio>"
+                    )
+
+                display_content = f"{generate_html_audio(audio)}<br>{text}"
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": text},
+                        {
+                            "type": "audio_url",
+                            "audio_url": {"url": audio},
+                        },
+                    ],
+                }
+
             else:
                 display_content = text
                 message = {"role": "user", "content": text}
             history = history + [message]
             bot = bot + [[display_content, None]]
-            return history, bot, "", None, None
+            return history, bot, "", None, None, None

         def clear_history():
             logger.debug("Clear history.")
-            return [], None, "", None, None
+            return [], None, "", None, None, None

         def update_button(text):
             return gr.update(interactive=bool(text))

+        has_vision = "vision" in self.model_ability
+        has_audio = "audio" in self.model_ability
+
         with gr.Blocks(
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
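The `elif audio:` branch above builds an OpenAI-style multi-part user message whose audio part is an `audio_url` entry, which is what audio-capable chat models (such as the qwen-omni support added elsewhere in this release) receive. Below is a minimal sketch, not part of the diff, of sending the same message shape through the RESTful client; the endpoint, model UID, and audio URL are placeholders, and whether a given model accepts remote URLs or local paths depends on its implementation:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # placeholder endpoint
model = client.get_model("my-omni-model")        # placeholder model UID

response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please transcribe this clip."},
                # Same message shape as the one built in add_text() above.
                {"type": "audio_url", "audio_url": {"url": "https://example.com/question.wav"}},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])

The predict() hunk earlier also shows that models which answer with speech return a base64-encoded "audio" field on the assistant message, which the Web UI decodes into a temporary .wav file.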
@@ -358,11 +554,29 @@ class GradioInterface:
             state = gr.State([])
             with gr.Row():
                 chatbot = gr.Chatbot(
-                    elem_id="chatbot", label=self.model_name,
+                    elem_id="chatbot", label=self.model_name, scale=7, min_height=900
                 )
                 with gr.Column(scale=3):
-
-
+                    if has_vision:
+                        imagebox = gr.Image(type="filepath")
+                        videobox = gr.Video()
+                    else:
+                        imagebox = gr.Image(type="filepath", visible=False)
+                        videobox = gr.Video(visible=False)
+
+                    if has_audio:
+                        audiobox = gr.Audio(
+                            sources=["microphone", "upload"],
+                            type="filepath",
+                            visible=True,
+                        )
+                    else:
+                        audiobox = gr.Audio(
+                            sources=["microphone", "upload"],
+                            type="filepath",
+                            visible=False,
+                        )
+
                     textbox = gr.Textbox(
                         show_label=False,
                         placeholder="Enter text and press ENTER",
@@ -390,8 +604,8 @@ class GradioInterface:

             textbox.submit(
                 add_text,
-                [state, chatbot, textbox, imagebox, videobox],
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
             ).then(
                 predict,
@@ -401,8 +615,8 @@ class GradioInterface:

             submit_btn.click(
                 add_text,
-                [state, chatbot, textbox, imagebox, videobox],
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
             ).then(
                 predict,
@@ -413,7 +627,7 @@ class GradioInterface:
             clear_btn.click(
                 clear_history,
                 None,
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
             )

xinference/core/model.py
CHANGED

@@ -231,6 +231,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+
         from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
xinference/core/progress_tracker.py
CHANGED

@@ -92,16 +92,20 @@ class ProgressTrackerActor(xo.StatelessActor):

             await asyncio.sleep(self._check_interval)

-    def start(self, request_id: str):
+    def start(self, request_id: str, info: Optional[str] = None):
         self._request_id_to_progress[request_id] = _ProgressInfo(
-            progress=0.0, last_updated=time.time()
+            progress=0.0, last_updated=time.time(), info=info
         )

-    def set_progress(
+    def set_progress(
+        self, request_id: str, progress: float, info: Optional[str] = None
+    ):
         assert progress <= 1.0
-
-
-
+        info_ = self._request_id_to_progress[request_id]
+        info_.progress = progress
+        info_.last_updated = time.time()
+        if info:
+            info_.info = info
         logger.debug(
             "Setting progress, request id: %s, progress: %s", request_id, progress
         )
@@ -109,6 +113,10 @@ class ProgressTrackerActor(xo.StatelessActor):
     def get_progress(self, request_id: str) -> float:
         return self._request_id_to_progress[request_id].progress

+    def get_progress_info(self, request_id: str) -> Tuple[float, Optional[str]]:
+        info = self._request_id_to_progress[request_id]
+        return info.progress, info.info
+

 class Progressor:
     _sub_progress_stack: List[Tuple[float, float]]
@@ -169,7 +177,7 @@ class Progressor:
             self.set_progress(1.0)
         return False

-    def set_progress(self, progress: float):
+    def set_progress(self, progress: float, info: Optional[str] = None):
         if self.request_id:
             self._current_progress = (
                 self._current_sub_progress_start
@@ -179,7 +187,7 @@ class Progressor:
             if (
                 self._current_progress - self._last_report_progress >= self._upload_span
                 or 1.0 - progress < 1e-5
-            ):
+            ) or info:
                 set_progress = self.progress_tracker_ref.set_progress(
                     self.request_id, self._current_progress
                 )
CHANGED
|
@@ -18,11 +18,13 @@ import os
|
|
|
18
18
|
import signal
|
|
19
19
|
import time
|
|
20
20
|
import typing
|
|
21
|
-
from
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from dataclasses import dataclass, field
|
|
22
23
|
from logging import getLogger
|
|
23
24
|
from typing import (
|
|
24
25
|
TYPE_CHECKING,
|
|
25
26
|
Any,
|
|
27
|
+
DefaultDict,
|
|
26
28
|
Dict,
|
|
27
29
|
Iterator,
|
|
28
30
|
List,
|
|
@@ -91,6 +93,9 @@ class WorkerStatus:
|
|
|
91
93
|
class ReplicaInfo:
|
|
92
94
|
replica: int
|
|
93
95
|
scheduler: Iterator
|
|
96
|
+
replica_to_worker_refs: DefaultDict[
|
|
97
|
+
int, List[xo.ActorRefType["WorkerActor"]]
|
|
98
|
+
] = field(default_factory=lambda: defaultdict(list))
|
|
94
99
|
|
|
95
100
|
|
|
96
101
|
class SupervisorActor(xo.StatelessActor):
|
|
@@ -1113,6 +1118,9 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
1113
1118
|
if target_ip_worker_ref is not None
|
|
1114
1119
|
else await self._choose_worker()
|
|
1115
1120
|
)
|
|
1121
|
+
self._model_uid_to_replica_info[model_uid].replica_to_worker_refs[
|
|
1122
|
+
_idx
|
|
1123
|
+
].append(worker_ref)
|
|
1116
1124
|
if enable_xavier and _idx == 0:
|
|
1117
1125
|
"""
|
|
1118
1126
|
Start the rank 0 model actor on the worker that holds the rank 1 replica,
|
|
@@ -1260,6 +1268,9 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
1260
1268
|
driver_info = None
|
|
1261
1269
|
for i_worker in range(n_worker):
|
|
1262
1270
|
worker_ref = await self._choose_worker(available_workers)
|
|
1271
|
+
self._model_uid_to_replica_info[
|
|
1272
|
+
model_uid
|
|
1273
|
+
].replica_to_worker_refs[_idx].append(worker_ref)
|
|
1263
1274
|
nonlocal model_type
|
|
1264
1275
|
model_type = model_type or "LLM"
|
|
1265
1276
|
if i_worker > 1:
|
|
@@ -1344,6 +1355,39 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
1344
1355
|
task.add_done_callback(lambda _: callback_for_async_launch(model_uid)) # type: ignore
|
|
1345
1356
|
return model_uid
|
|
1346
1357
|
|
|
1358
|
+
async def get_launch_builtin_model_progress(self, model_uid: str) -> float:
|
|
1359
|
+
info = self._model_uid_to_replica_info[model_uid]
|
|
1360
|
+
all_progress = 0.0
|
|
1361
|
+
i = 0
|
|
1362
|
+
for rep_model_uid in iter_replica_model_uid(model_uid, info.replica):
|
|
1363
|
+
request_id = f"launching-{rep_model_uid}"
|
|
1364
|
+
try:
|
|
1365
|
+
all_progress += await self._progress_tracker.get_progress(request_id)
|
|
1366
|
+
i += 1
|
|
1367
|
+
except KeyError:
|
|
1368
|
+
continue
|
|
1369
|
+
|
|
1370
|
+
return all_progress / i if i > 0 else 0.0
|
|
1371
|
+
|
|
1372
|
+
async def cancel_launch_builtin_model(self, model_uid: str):
|
|
1373
|
+
info = self._model_uid_to_replica_info[model_uid]
|
|
1374
|
+
coros = []
|
|
1375
|
+
for i, rep_model_uid in enumerate(
|
|
1376
|
+
iter_replica_model_uid(model_uid, info.replica)
|
|
1377
|
+
):
|
|
1378
|
+
worker_refs = self._model_uid_to_replica_info[
|
|
1379
|
+
model_uid
|
|
1380
|
+
].replica_to_worker_refs[i]
|
|
1381
|
+
for worker_ref in worker_refs:
|
|
1382
|
+
coros.append(worker_ref.cancel_launch_model(rep_model_uid))
|
|
1383
|
+
try:
|
|
1384
|
+
await asyncio.gather(*coros)
|
|
1385
|
+
except RuntimeError:
|
|
1386
|
+
# some may have finished
|
|
1387
|
+
pass
|
|
1388
|
+
# remove replica info
|
|
1389
|
+
self._model_uid_to_replica_info.pop(model_uid, None)
|
|
1390
|
+
|
|
1347
1391
|
async def get_instance_info(
|
|
1348
1392
|
self, model_name: Optional[str], model_uid: Optional[str]
|
|
1349
1393
|
) -> List[Dict]:
|