xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_compat.py +1 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +54 -1
- xinference/client/restful/restful_client.py +82 -2
- xinference/constants.py +3 -0
- xinference/core/chat_interface.py +297 -83
- xinference/core/model.py +24 -3
- xinference/core/progress_tracker.py +16 -8
- xinference/core/supervisor.py +51 -1
- xinference/core/worker.py +315 -47
- xinference/deploy/cmdline.py +33 -1
- xinference/model/audio/core.py +11 -1
- xinference/model/audio/megatts.py +105 -0
- xinference/model/audio/model_spec.json +24 -1
- xinference/model/audio/model_spec_modelscope.json +26 -1
- xinference/model/core.py +14 -0
- xinference/model/embedding/core.py +6 -1
- xinference/model/flexible/core.py +6 -1
- xinference/model/image/core.py +6 -1
- xinference/model/image/model_spec.json +17 -1
- xinference/model/image/model_spec_modelscope.json +17 -1
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/core.py +5 -0
- xinference/model/llm/llama_cpp/core.py +46 -17
- xinference/model/llm/llm_family.json +530 -85
- xinference/model/llm/llm_family.py +24 -1
- xinference/model/llm/llm_family_modelscope.json +572 -1
- xinference/model/llm/mlx/core.py +16 -2
- xinference/model/llm/reasoning_parser.py +3 -3
- xinference/model/llm/sglang/core.py +111 -13
- xinference/model/llm/transformers/__init__.py +14 -0
- xinference/model/llm/transformers/core.py +31 -6
- xinference/model/llm/transformers/deepseek_vl.py +1 -1
- xinference/model/llm/transformers/deepseek_vl2.py +287 -0
- xinference/model/llm/transformers/gemma3.py +17 -2
- xinference/model/llm/transformers/intern_vl.py +28 -18
- xinference/model/llm/transformers/minicpmv26.py +21 -2
- xinference/model/llm/transformers/qwen-omni.py +308 -0
- xinference/model/llm/transformers/qwen2_audio.py +1 -1
- xinference/model/llm/transformers/qwen2_vl.py +20 -4
- xinference/model/llm/utils.py +37 -15
- xinference/model/llm/vllm/core.py +184 -8
- xinference/model/llm/vllm/distributed_executor.py +320 -0
- xinference/model/rerank/core.py +22 -12
- xinference/model/utils.py +118 -1
- xinference/model/video/core.py +6 -1
- xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
- xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
- xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
- xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
- xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
- xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
- xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
- xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
- xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
- xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
- xinference/thirdparty/megatts3/__init__.py +0 -0
- xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
- xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
- xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
- xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
- xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
- xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
- xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
- xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
- xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
- xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
- xinference/types.py +10 -0
- xinference/utils.py +54 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
- xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
- xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
- xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
- xinference/web/ui/src/locales/en.json +2 -1
- xinference/web/ui/src/locales/zh.json +2 -1
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
- xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
- xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
- xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
- xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
- /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
- {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0

xinference/core/chat_interface.py
CHANGED

@@ -16,11 +16,13 @@ import base64
 import html
 import logging
 import os
+import tempfile
 from io import BytesIO
-from typing import
+from typing import Generator, List, Optional
 
 import gradio as gr
 import PIL.Image
+from gradio import ChatMessage
 from gradio.components import Markdown, Textbox
 from gradio.layouts import Accordion, Column, Row
 
@@ -65,13 +67,13 @@ class GradioInterface:
 
     def build(self) -> "gr.Blocks":
         if "vision" in self.model_ability:
-            interface = self.
+            interface = self.build_chat_multimodel_interface()
         elif "chat" in self.model_ability:
             interface = self.build_chat_interface()
         else:
             interface = self.build_generate_interface()
 
-        interface.queue()
+        interface.queue(default_concurrency_limit=os.cpu_count())
         # Gradio initiates the queue during a startup event, but since the app has already been
         # started, that event will not run, so manually invoke the startup events.
         # See: https://github.com/gradio-app/gradio/issues/5228
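Note: `interface.queue(default_concurrency_limit=os.cpu_count())` lets as many Gradio event handlers run concurrently as there are CPU cores instead of Gradio's default limit. A minimal, self-contained sketch of the same setting outside xinference (the `echo` handler and component names below are illustrative, not part of the package):

```python
import os

import gradio as gr


def echo(message: str) -> str:
    # Trivial handler used only to illustrate the queue setting.
    return message


with gr.Blocks() as demo:
    box = gr.Textbox(label="Message")
    out = gr.Textbox(label="Echo")
    box.submit(echo, box, out)

# Allow one concurrent handler per CPU core, mirroring the
# interface.queue(...) call added in the diff above.
demo.queue(default_concurrency_limit=os.cpu_count())

if __name__ == "__main__":
    demo.launch()
```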
@@ -91,25 +93,10 @@ class GradioInterface:
         interface.favicon_path = favicon_path
         return interface
 
-    def build_chat_interface(
-        self,
-    ) -> "gr.Blocks":
-        def flatten(matrix: List[List[str]]) -> List[str]:
-            flat_list = []
-            for row in matrix:
-                flat_list += row
-            return flat_list
-
-        def to_chat(lst: List[str]) -> List[Dict]:
-            res = []
-            for i in range(len(lst)):
-                role = "assistant" if i % 2 == 1 else "user"
-                res.append(dict(role=role, content=lst[i]))
-            return res
-
+    def build_chat_interface(self) -> "gr.Blocks":
         def generate_wrapper(
             message: str,
-            history: List[
+            history: List[ChatMessage],
             max_tokens: int,
             temperature: float,
             lora_name: str,
@@ -121,13 +108,22 @@ class GradioInterface:
             client._set_token(self._access_token)
             model = client.get_model(self.model_uid)
             assert isinstance(model, RESTfulChatModelHandle)
-
-            messages
+
+            # Convert history to messages format
+            messages = []
+            for msg in history:
+                # ignore thinking content
+                if msg["metadata"]:
+                    continue
+                messages.append({"role": msg["role"], "content": msg["content"]})
 
             if stream:
                 response_content = ""
+                reasoning_content = ""
+                is_first_reasoning_content = True
+                is_first_content = True
                 for chunk in model.chat(
-                    messages,
+                    messages=messages,
                     generate_config={
                         "max_tokens": int(max_tokens),
                         "temperature": temperature,
@@ -137,46 +133,79 @@ class GradioInterface:
                 ):
                     assert isinstance(chunk, dict)
                     delta = chunk["choices"][0]["delta"]
-                    if "content" not in delta or delta["content"] is None:
-                        continue
-                    else:
-                        # some model like deepseek-r1-distill-qwen
-                        # will generate <think>...</think> ...
-                        # in gradio, no output will be rendered,
-                        # thus escape html tags in advance
-                        response_content += html.escape(delta["content"])
-                        yield response_content
 
-
+                    if (
+                        "reasoning_content" in delta
+                        and delta["reasoning_content"] is not None
+                        and is_first_reasoning_content
+                    ):
+                        reasoning_content += html.escape(delta["reasoning_content"])
+                        history.append(
+                            ChatMessage(
+                                role="assistant",
+                                content=reasoning_content,
+                                metadata={"title": "💭 Thinking Process"},
+                            )
+                        )
+                        is_first_reasoning_content = False
+                    elif (
+                        "reasoning_content" in delta
+                        and delta["reasoning_content"] is not None
+                    ):
+                        reasoning_content += html.escape(delta["reasoning_content"])
+                        history[-1] = ChatMessage(
+                            role="assistant",
+                            content=reasoning_content,
+                            metadata={"title": "💭 Thinking Process"},
+                        )
+                    elif (
+                        "content" in delta
+                        and delta["content"] is not None
+                        and is_first_content
+                    ):
+                        response_content += html.escape(delta["content"])
+                        history.append(
+                            ChatMessage(role="assistant", content=response_content)
+                        )
+                        is_first_content = False
+                    elif "content" in delta and delta["content"] is not None:
+                        response_content += html.escape(delta["content"])
+                        # Replace thinking message with actual response
+                        history[-1] = ChatMessage(
+                            role="assistant", content=response_content
+                        )
+                    yield history
             else:
                 result = model.chat(
-                    messages,
+                    messages=messages,
                     generate_config={
                         "max_tokens": int(max_tokens),
                         "temperature": temperature,
                         "lora_name": lora_name,
                     },
                 )
-
+                assert isinstance(result, dict)
+                mg = result["choices"][0]["message"]
+                if "reasoning_content" in mg:
+                    reasoning_content = mg["reasoning_content"]
+                    if reasoning_content is not None:
+                        reasoning_content = html.escape(str(reasoning_content))
+                    history.append(
+                        ChatMessage(
+                            role="assistant",
+                            content=reasoning_content,
+                            metadata={"title": "💭 Thinking Process"},
+                        )
+                    )
 
-
-
-
-
-
-
-
-
-                        else self.context_length // 2,
-                        step=1,
-                        label="Max Tokens",
-                    ),
-                    gr.Slider(
-                        minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
-                    ),
-                    gr.Text(label="LoRA Name"),
-                    gr.Checkbox(label="Stream", value=True),
-                ],
+                content = mg["content"]
+                response_content = (
+                    html.escape(str(content)) if content is not None else ""
+                )
+                history.append(ChatMessage(role="assistant", content=response_content))
+                yield history
+
+        with gr.Blocks(
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
             .center{
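Note: the streaming branch above renders `reasoning_content` deltas as a separate assistant message whose `metadata={"title": ...}` makes Gradio show a collapsible "thinking" bubble, while ordinary `content` deltas replace it with the visible answer. A minimal sketch of that `ChatMessage` pattern outside xinference, assuming a recent Gradio with `gr.Chatbot(type="messages")`; the `fake_chat` generator stands in for `model.chat()`:

```python
import gradio as gr
from gradio import ChatMessage


def fake_chat(message, history):
    # Echo the user turn, then stream a "thinking" bubble followed by the answer.
    history = list(history) + [ChatMessage(role="user", content=message)]
    history.append(
        ChatMessage(
            role="assistant",
            content="Considering the question...",
            # The metadata title is what makes the message collapsible.
            metadata={"title": "💭 Thinking Process"},
        )
    )
    yield history
    # The final answer is appended as a separate, normal assistant message.
    history.append(ChatMessage(role="assistant", content=f"Echo: {message}"))
    yield history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    textbox = gr.Textbox(placeholder="Type a message...")
    textbox.submit(fake_chat, [textbox, chatbot], chatbot)

if __name__ == "__main__":
    demo.launch()
```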
@@ -186,25 +215,123 @@ class GradioInterface:
             padding: 0px;
             color: #9ea4b0 !important;
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            .main-container {
+                display: flex;
+                flex-direction: column;
+                padding: 0.5rem;
+                box-sizing: border-box;
+                gap: 0.25rem;
+                flex-grow: 1;
+                min-width: min(320px, 100%);
+                height: calc(100vh - 70px)!important;
+            }
+            .header {
+                flex-grow: 0!important;
+            }
+            .header h1 {
+                margin: 0.5rem 0;
+                font-size: 1.5rem;
+            }
+            .center {
+                font-size: 0.9rem;
+                margin: 0.1rem 0;
+            }
+            .chat-container {
+                flex: 1;
+                display: flex;
+                min-height: 0;
+                margin: 0.25rem 0;
+            }
+            .chat-container .block {
+                height: 100%!important;
+            }
+            .input-container {
+                flex-grow: 0!important;
+            }
             """,
             analytics_enabled=False,
-        )
+        ) as chat_interface:
+            with gr.Column(elem_classes="main-container"):
+                # Header section
+                with gr.Column(elem_classes="header"):
+                    gr.Markdown(
+                        f"""<h1 style='text-align: center; margin-bottom: 1rem'>🚀 Xinference Chat Bot : {self.model_name} 🚀</h1>"""
+                    )
+                    gr.Markdown(
+                        f"""
+                        <div class="center">Model ID: {self.model_uid}</div>
+                        <div class="center">Model Size: {self.model_size_in_billions} Billion Parameters</div>
+                        <div class="center">Model Format: {self.model_format}</div>
+                        <div class="center">Model Quantization: {self.quantization}</div>
+                        """
+                    )
+
+                # Chat container
+                with gr.Column(elem_classes="chat-container"):
+                    chatbot = gr.Chatbot(
+                        type="messages",
+                        label=self.model_name,
+                        show_label=True,
+                        render_markdown=True,
+                        container=True,
+                    )
 
-
+                # Input container
+                with gr.Column(elem_classes="input-container"):
+                    with gr.Row():
+                        with gr.Column(scale=12):
+                            textbox = gr.Textbox(
+                                show_label=False,
+                                placeholder="Type a message...",
+                                container=False,
+                            )
+                        with gr.Column(scale=1, min_width=50):
+                            submit_btn = gr.Button("Enter", variant="primary")
+
+                with gr.Accordion("Additional Inputs", open=False):
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=self.context_length,
+                        value=512
+                        if "reasoning" not in self.model_ability
+                        else self.context_length // 2,
+                        step=1,
+                        label="Max Tokens",
+                    )
+                    temperature = gr.Slider(
+                        minimum=0,
+                        maximum=2,
+                        value=1,
+                        step=0.01,
+                        label="Temperature",
+                    )
+                    stream = gr.Checkbox(label="Stream", value=True)
+                    lora_name = gr.Text(label="LoRA Name")
+
+                # deal with message submit
+                textbox.submit(
+                    lambda m, h: ("", h + [ChatMessage(role="user", content=m)]),
+                    [textbox, chatbot],
+                    [textbox, chatbot],
+                ).then(
+                    generate_wrapper,
+                    [textbox, chatbot, max_tokens, temperature, lora_name, stream],
+                    chatbot,
+                )
+
+                submit_btn.click(
+                    lambda m, h: ("", h + [ChatMessage(role="user", content=m)]),
+                    [textbox, chatbot],
+                    [textbox, chatbot],
+                ).then(
+                    generate_wrapper,
+                    [textbox, chatbot, max_tokens, temperature, lora_name, stream],
+                    chatbot,
+                )
+
+        return chat_interface
+
+    def build_chat_multimodel_interface(
         self,
     ) -> "gr.Blocks":
         def predict(history, bot, max_tokens, temperature, stream):
@@ -251,11 +378,46 @@ class GradioInterface:
                 },
             )
             history.append(response["choices"][0]["message"])
-
-
+            if "audio" in history[-1]:
+                # audio output
+                audio_bytes = base64.b64decode(history[-1]["audio"]["data"])
+                audio_file = tempfile.NamedTemporaryFile(
+                    delete=False, suffix=".wav"
+                )
+                audio_file.write(audio_bytes)
+                audio_file.close()
+
+                def audio_to_base64(audio_path):
+                    with open(audio_path, "rb") as audio_file:
+                        return base64.b64encode(audio_file.read()).decode("utf-8")
+
+                def generate_html_audio(audio_path):
+                    base64_audio = audio_to_base64(audio_path)
+                    audio_format = audio_path.split(".")[-1]
+                    return (
+                        f"<audio controls style='max-width:100%;'>"
+                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
+                        f"Your browser does not support the audio tag.</audio>"
+                    )
 
-
-
+                bot[-1] = (bot[-1][0], history[-1]["content"])
+                yield history, bot
+
+                # append html audio tag instead of gr.Audio
+                bot.append((None, generate_html_audio(audio_file.name)))
+                yield history, bot
+            else:
+                bot[-1][1] = history[-1]["content"]
+                yield history, bot
+
+        def add_text(history, bot, text, image, video, audio):
+            logger.debug(
+                "Add text, text: %s, image: %s, video: %s, audio: %s",
+                text,
+                image,
+                video,
+                audio,
+            )
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:
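Note: because Gradio chat messages can carry raw HTML, the new audio path base64-encodes the generated WAV and embeds it in an `<audio>` tag rather than adding a `gr.Audio` component. The helpers below mirror `audio_to_base64` / `generate_html_audio` from the diff as a standalone sketch; the `reply.wav` path is hypothetical:

```python
import base64


def audio_to_base64(audio_path: str) -> str:
    # Read the audio file and return its base64 text representation.
    with open(audio_path, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode("utf-8")


def generate_html_audio(audio_path: str) -> str:
    # Build an inline <audio> tag with a data: URI so no file serving is needed.
    base64_audio = audio_to_base64(audio_path)
    audio_format = audio_path.split(".")[-1]
    return (
        f"<audio controls style='max-width:100%;'>"
        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
        f"Your browser does not support the audio tag.</audio>"
    )


if __name__ == "__main__":
    # e.g. embed a local wav file as raw HTML in a chat bubble
    print(generate_html_audio("reply.wav"))
```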
@@ -306,20 +468,54 @@ class GradioInterface:
                        },
                    ],
                }
+
+            elif audio:
+
+                def audio_to_base64(audio_path):
+                    with open(audio_path, "rb") as audio_file:
+                        encoded_string = base64.b64encode(audio_file.read()).decode(
+                            "utf-8"
+                        )
+                    return encoded_string
+
+                def generate_html_audio(audio_path):
+                    base64_audio = audio_to_base64(audio_path)
+                    audio_format = audio_path.split(".")[-1]
+                    return (
+                        f"<audio controls style='max-width:100%;'>"
+                        f"<source src='data:audio/{audio_format};base64,{base64_audio}' type='audio/{audio_format}'>"
+                        f"Your browser does not support the audio tag.</audio>"
+                    )
+
+                display_content = f"{generate_html_audio(audio)}<br>{text}"
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": text},
+                        {
+                            "type": "audio_url",
+                            "audio_url": {"url": audio},
+                        },
+                    ],
+                }
+
             else:
                 display_content = text
                 message = {"role": "user", "content": text}
             history = history + [message]
             bot = bot + [[display_content, None]]
-            return history, bot, "", None, None
+            return history, bot, "", None, None, None
 
         def clear_history():
             logger.debug("Clear history.")
-            return [], None, "", None, None
+            return [], None, "", None, None, None
 
         def update_button(text):
             return gr.update(interactive=bool(text))
 
+        has_vision = "vision" in self.model_ability
+        has_audio = "audio" in self.model_ability
+
         with gr.Blocks(
             title=f"🚀 Xinference Chat Bot : {self.model_name} 🚀",
             css="""
@@ -358,11 +554,29 @@ class GradioInterface:
             state = gr.State([])
             with gr.Row():
                 chatbot = gr.Chatbot(
-                    elem_id="chatbot", label=self.model_name,
+                    elem_id="chatbot", label=self.model_name, scale=7, min_height=900
                 )
             with gr.Column(scale=3):
-
-
+                if has_vision:
+                    imagebox = gr.Image(type="filepath")
+                    videobox = gr.Video()
+                else:
+                    imagebox = gr.Image(type="filepath", visible=False)
+                    videobox = gr.Video(visible=False)
+
+                if has_audio:
+                    audiobox = gr.Audio(
+                        sources=["microphone", "upload"],
+                        type="filepath",
+                        visible=True,
+                    )
+                else:
+                    audiobox = gr.Audio(
+                        sources=["microphone", "upload"],
+                        type="filepath",
+                        visible=False,
+                    )
+
                 textbox = gr.Textbox(
                     show_label=False,
                     placeholder="Enter text and press ENTER",
@@ -390,8 +604,8 @@ class GradioInterface:
 
             textbox.submit(
                 add_text,
-                [state, chatbot, textbox, imagebox, videobox],
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
             ).then(
                 predict,
@@ -401,8 +615,8 @@ class GradioInterface:
 
             submit_btn.click(
                 add_text,
-                [state, chatbot, textbox, imagebox, videobox],
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
            ).then(
                 predict,
@@ -413,7 +627,7 @@ class GradioInterface:
             clear_btn.click(
                 clear_history,
                 None,
-                [state, chatbot, textbox, imagebox, videobox],
+                [state, chatbot, textbox, imagebox, videobox, audiobox],
                 queue=False,
             )
 
xinference/core/model.py
CHANGED

@@ -185,7 +185,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         )
 
         if hasattr(self._model, "stop") and callable(self._model.stop):
-            self._model.stop
+            await asyncio.to_thread(self._model.stop)
 
         if isinstance(self._model, LLMVLLMModel):
             if self._transfer_ref is not None:
@@ -231,6 +231,7 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         driver_info: Optional[dict] = None,  # for model across workers
     ):
         super().__init__()
+
         from ..model.llm.llama_cpp.core import XllamaCppModel
         from ..model.llm.lmdeploy.core import LMDeployModel
         from ..model.llm.sglang.core import SGLANGModel
@@ -284,6 +285,8 @@ class ModelActor(xo.StatelessActor, CancelMixin):
     async def __post_create__(self):
         self._loop = asyncio.get_running_loop()
 
+        logger.debug("Starting ModelActor at %s, uid: %s", self.address, self.uid)
+
         self._handle_pending_requests_task = asyncio.create_task(
             self._handle_pending_requests()
         )
@@ -463,7 +466,9 @@ class ModelActor(xo.StatelessActor, CancelMixin):
         while True:
             i += 1
             try:
-                self._model
+                if hasattr(self._model, "set_loop"):
+                    self._model.set_loop(asyncio.get_running_loop())
+                await asyncio.to_thread(self._model.load)
                 if hasattr(self._model, "driver_info"):
                     self._driver_info = self._model.driver_info
                 break
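Note: `ModelActor` now wraps blocking model calls (`load`, `stop`, `wait_for_load`) in `asyncio.to_thread` so they run in a worker thread instead of stalling the actor's event loop. A minimal sketch of that pattern with a stand-in class; `DummyModel` is illustrative, not an xinference type:

```python
import asyncio
import time


class DummyModel:
    def load(self) -> None:
        # Simulate a slow, blocking weight load.
        time.sleep(2)

    def stop(self) -> None:
        # Simulate blocking cleanup.
        time.sleep(0.5)


async def main() -> None:
    model = DummyModel()
    # The event loop stays free to serve other coroutines while load() runs
    # in a worker thread, which is the point of the change in the diff above.
    await asyncio.to_thread(model.load)
    await asyncio.to_thread(model.stop)


if __name__ == "__main__":
    asyncio.run(main())
```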
@@ -490,7 +495,23 @@ class ModelActor(xo.StatelessActor, CancelMixin):
 
     async def wait_for_load(self):
         if hasattr(self._model, "wait_for_load"):
-            self._model.wait_for_load
+            await asyncio.to_thread(self._model.wait_for_load)
+
+    def need_create_pools(self):
+        return getattr(self._model, "need_create_pools", False)
+
+    def set_pool_addresses(self, pool_addresses: List[str]):
+        if hasattr(self._model, "set_pool_addresses"):
+            self._model.set_pool_addresses(pool_addresses)
+
+    def get_pool_addresses(self) -> Optional[List[str]]:
+        if hasattr(self._model, "get_pool_addresses"):
+            return self._model.get_pool_addresses()
+        return None
+
+    def set_worker_addresses(self, shard: int, worker_addresses: List[str]):
+        if hasattr(self._model, "set_worker_addresses"):
+            self._model.set_worker_addresses(shard, worker_addresses)
 
     def model_uid(self):
         return (

xinference/core/progress_tracker.py
CHANGED

@@ -92,16 +92,20 @@ class ProgressTrackerActor(xo.StatelessActor):
 
             await asyncio.sleep(self._check_interval)
 
-    def start(self, request_id: str):
+    def start(self, request_id: str, info: Optional[str] = None):
         self._request_id_to_progress[request_id] = _ProgressInfo(
-            progress=0.0, last_updated=time.time()
+            progress=0.0, last_updated=time.time(), info=info
        )
 
-    def set_progress(
+    def set_progress(
+        self, request_id: str, progress: float, info: Optional[str] = None
+    ):
         assert progress <= 1.0
-
-
-
+        info_ = self._request_id_to_progress[request_id]
+        info_.progress = progress
+        info_.last_updated = time.time()
+        if info:
+            info_.info = info
         logger.debug(
             "Setting progress, request id: %s, progress: %s", request_id, progress
         )
@@ -109,6 +113,10 @@ class ProgressTrackerActor(xo.StatelessActor):
     def get_progress(self, request_id: str) -> float:
         return self._request_id_to_progress[request_id].progress
 
+    def get_progress_info(self, request_id: str) -> Tuple[float, Optional[str]]:
+        info = self._request_id_to_progress[request_id]
+        return info.progress, info.info
+
 
 class Progressor:
     _sub_progress_stack: List[Tuple[float, float]]
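Note: the progress tracker now stores an optional free-text `info` string alongside the numeric progress, and `get_progress_info` returns both. A standalone sketch of that bookkeeping pattern; the class and method names below are illustrative stand-ins, not the actual actor classes:

```python
import time
from dataclasses import dataclass
from typing import Dict, Optional, Tuple


@dataclass
class ProgressInfo:
    progress: float
    last_updated: float
    info: Optional[str] = None


class ProgressBook:
    """Toy per-request progress store mirroring the (progress, info) pattern."""

    def __init__(self) -> None:
        self._requests: Dict[str, ProgressInfo] = {}

    def start(self, request_id: str, info: Optional[str] = None) -> None:
        self._requests[request_id] = ProgressInfo(0.0, time.time(), info)

    def set_progress(
        self, request_id: str, progress: float, info: Optional[str] = None
    ) -> None:
        assert progress <= 1.0
        entry = self._requests[request_id]
        entry.progress = progress
        entry.last_updated = time.time()
        if info:
            entry.info = info

    def get_progress_info(self, request_id: str) -> Tuple[float, Optional[str]]:
        entry = self._requests[request_id]
        return entry.progress, entry.info


if __name__ == "__main__":
    book = ProgressBook()
    book.start("req-1", info="downloading weights")
    book.set_progress("req-1", 0.5, info="converting weights")
    print(book.get_progress_info("req-1"))  # (0.5, 'converting weights')
```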
@@ -169,7 +177,7 @@ class Progressor:
             self.set_progress(1.0)
             return False
 
-    def set_progress(self, progress: float):
+    def set_progress(self, progress: float, info: Optional[str] = None):
         if self.request_id:
             self._current_progress = (
                 self._current_sub_progress_start
@@ -179,7 +187,7 @@ class Progressor:
             if (
                 self._current_progress - self._last_report_progress >= self._upload_span
                 or 1.0 - progress < 1e-5
-            ):
+            ) or info:
                 set_progress = self.progress_tracker_ref.set_progress(
                     self.request_id, self._current_progress
                 )