xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +5 -39
- xinference/client/restful/restful_client.py +3 -24
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/model.py +82 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +11 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/stable_diffusion/core.py +18 -1
- xinference/model/llm/__init__.py +21 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +619 -1297
- xinference/model/llm/llm_family.py +31 -52
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +573 -1119
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +33 -18
- xinference/model/llm/transformers/chatglm.py +167 -305
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +49 -50
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_vl.py +208 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +195 -489
- xinference/model/llm/vllm/core.py +153 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +7 -49
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
- xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/core/scheduler.py
CHANGED
@@ -18,7 +18,7 @@ import logging
 import uuid
 from collections import deque
 from enum import Enum
-from typing import List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import xoscar as xo
 
@@ -37,13 +37,24 @@ class AbortRequestMessage(Enum):
 
 
 class InferenceRequest:
-    def __init__(self, prompt, future_or_queue, is_prefill, *args, **kwargs):
-        # original prompt
-        self._prompt = prompt
+    def __init__(
+        self,
+        prompt_or_messages,
+        future_or_queue,
+        is_prefill,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        # original prompt, prompt(str) for generate model and messages(List[Dict]) for chat model
+        self._prompt = prompt_or_messages
         # full prompt that contains chat history and applies chat template
         self._full_prompt = None
         # whether the current request is in the prefill phase
         self._is_prefill = is_prefill
+        # the ability that the user calls this model for, that is `generate` / `chat` for now,
+        # which is for results formatting
+        self._call_ability = call_ability
         # full prompt tokens
         self._prompt_tokens = None
         # all new generated tokens during decode phase
@@ -88,38 +99,22 @@ class InferenceRequest:
         self._check_args()
 
     def _check_args(self):
-        …
-        …
-        …
-        …
-        …
-            )
-            # chat history
-            assert self._inference_args[1] is None or isinstance(
-                self._inference_args[1], list
-            )
-            # generate config
-            assert self._inference_args[2] is None or isinstance(
-                self._inference_args[2], dict
-            )
-        else:  # generate
-            assert len(self._inference_args) == 1
-            # generate config
-            assert self._inference_args[0] is None or isinstance(
-                self._inference_args[0], dict
-            )
+        assert len(self._inference_args) == 1
+        # generate config
+        assert self._inference_args[0] is None or isinstance(
+            self._inference_args[0], dict
+        )
 
     @property
     def prompt(self):
+        """
+        prompt for generate model and messages for chat model
+        """
         return self._prompt
 
     @property
-    def system_prompt(self):
-        return self._inference_args[0]
-
-    @property
-    def chat_history(self):
-        return self._inference_args[1]
+    def call_ability(self):
+        return self._call_ability
 
     @property
     def full_prompt(self):
@@ -162,11 +157,7 @@ class InferenceRequest:
 
     @property
     def generate_config(self):
-        return (
-            self._inference_args[2]
-            if len(self._inference_args) == 3
-            else self._inference_args[0]
-        )
+        return self._inference_args[0]
 
     @property
     def sanitized_generate_config(self):
@@ -423,8 +414,17 @@ class SchedulerActor(xo.StatelessActor):
 
         self._empty_cache()
 
-    async def add_request(self, prompt: str, future_or_queue, *args, **kwargs):
-        req = InferenceRequest(prompt, future_or_queue, True, *args, **kwargs)
+    async def add_request(
+        self,
+        prompt_or_messages: Union[str, List[Dict]],
+        future_or_queue,
+        call_ability,
+        *args,
+        **kwargs,
+    ):
+        req = InferenceRequest(
+            prompt_or_messages, future_or_queue, True, call_ability, *args, **kwargs
+        )
         rid = req.request_id
         if rid is not None:
             if rid in self._id_to_req:
xinference/core/status_guard.py
CHANGED
xinference/core/supervisor.py
CHANGED
@@ -105,7 +105,7 @@ class SupervisorActor(xo.StatelessActor):
         self._lock = asyncio.Lock()
 
     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "supervisor"
 
     def _get_worker_ref_by_ip(
@@ -135,12 +135,12 @@ class SupervisorActor(xo.StatelessActor):
         self._status_guard_ref: xo.ActorRefType[  # type: ignore
             "StatusGuardActor"
         ] = await xo.create_actor(
-            StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
+            StatusGuardActor, address=self.address, uid=StatusGuardActor.default_uid()
         )
         self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
             "CacheTrackerActor"
         ] = await xo.create_actor(
-            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
+            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.default_uid()
         )
 
         from .event import EventCollectorActor
@@ -148,7 +148,9 @@ class SupervisorActor(xo.StatelessActor):
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = await xo.create_actor(
-            EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
+            EventCollectorActor,
+            address=self.address,
+            uid=EventCollectorActor.default_uid(),
         )
 
         from ..model.audio import (
@@ -308,10 +310,7 @@ class SupervisorActor(xo.StatelessActor):
         async def get_builtin_prompts() -> Dict[str, Any]:
             from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
 
-            data = {}
-            for k, v in BUILTIN_LLM_PROMPT_STYLE.items():
-                data[k] = v.dict()
-            return data
+            return {k: v for k, v in BUILTIN_LLM_PROMPT_STYLE.items()}
 
     @staticmethod
     async def get_builtin_families() -> Dict[str, List[str]]:
@@ -1028,7 +1027,7 @@ class SupervisorActor(xo.StatelessActor):
         else:
             task = asyncio.create_task(_launch_model())
             ASYNC_LAUNCH_TASKS[model_uid] = task
-            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
+            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))  # type: ignore
         return model_uid
 
     async def get_instance_info(
@@ -1233,7 +1232,9 @@ class SupervisorActor(xo.StatelessActor):
             worker_address not in self._worker_address_to_worker
         ), f"Worker {worker_address} exists"
 
-        worker_ref = await xo.actor_ref(address=worker_address, uid=WorkerActor.uid())
+        worker_ref = await xo.actor_ref(
+            address=worker_address, uid=WorkerActor.default_uid()
+        )
         self._worker_address_to_worker[worker_address] = worker_ref
         logger.debug("Worker %s has been added successfully", worker_address)
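
The recurring edit in this file (and in worker.py and the deploy modules below) is the rename of each actor's well-known-uid classmethod from uid() to default_uid(); plausibly this avoids shadowing the uid attribute that xoscar actors already carry, though the diff itself does not state a motivation. A sketch of resolving the supervisor under the new name, assuming a cluster is already listening at the given address:

import asyncio

import xoscar as xo

from xinference.core.supervisor import SupervisorActor


async def get_supervisor(address: str):
    # 0.15.0 exposes default_uid(); 0.14.x used SupervisorActor.uid()
    return await xo.actor_ref(address=address, uid=SupervisorActor.default_uid())


# e.g. ref = asyncio.run(get_supervisor("127.0.0.1:9999"))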
xinference/core/utils.py
CHANGED
@@ -11,62 +11,120 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import logging
 import os
 import random
 import string
-…
+import uuid
+from typing import Dict, Generator, List, Optional, Tuple, Union
 
 import orjson
 from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
 
 from .._compat import BaseModel
+from ..constants import XINFERENCE_LOG_ARG_MAX_LENGTH
 
 logger = logging.getLogger(__name__)
 
 
-def log_async(logger):
+def truncate_log_arg(arg) -> str:
+    s = str(arg)
+    if len(s) > XINFERENCE_LOG_ARG_MAX_LENGTH:
+        s = s[0:XINFERENCE_LOG_ARG_MAX_LENGTH] + "..."
+    return s
+
+
+def log_async(
+    logger,
+    level=logging.DEBUG,
+    ignore_kwargs: Optional[List[str]] = None,
+    log_exception=True,
+):
    import time
     from functools import wraps
 
     def decorator(func):
+        func_name = func.__name__
+
         @wraps(func)
         async def wrapped(*args, **kwargs):
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+            request_id_str = kwargs.get("request_id", "")
+            if not request_id_str:
+                request_id_str = uuid.uuid1()
+            request_id_str = f"[request {request_id_str}]"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                [
+                    "%s=%s" % (k, truncate_log_arg(v))
+                    for k, v in kwargs.items()
+                    if ignore_kwargs is None or k not in ignore_kwargs
+                ]
             )
-            …
-            …
-            …
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            logger.log(
+                level,
+                f"{request_id_str} Enter {func_name}, args: {formatted_args}, kwargs: {formatted_kwargs}",
             )
-            …
+            start = time.time()
+            try:
+                ret = await func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"{request_id_str} Leave {func_name}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"{request_id_str} Leave {func_name}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise
 
         return wrapped
 
     return decorator
 
 
-def log_sync(logger):
+def log_sync(logger, level=logging.DEBUG, log_exception=True):
     import time
     from functools import wraps
 
     def decorator(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
-            …
-            …
-            …
-            logger.debug(
-                f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s"
+            formatted_args = ",".join(map(truncate_log_arg, args))
+            formatted_kwargs = ",".join(
+                map(lambda x: "%s=%s" % (x[0], truncate_log_arg(x[1])), kwargs.items())
             )
-            …
+            logger.log(
+                level,
+                f"Enter {func.__name__}, args: {formatted_args}, kwargs: {formatted_kwargs}",
+            )
+            start = time.time()
+            try:
+                ret = func(*args, **kwargs)
+                logger.log(
+                    level,
+                    f"Leave {func.__name__}, elapsed time: {int(time.time() - start)} s",
+                )
+                return ret
+            except Exception as e:
+                if log_exception:
+                    logger.error(
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                        exc_info=True,
+                    )
+                else:
+                    logger.log(
+                        level,
+                        f"Leave {func.__name__}, error: {e}, elapsed time: {int(time.time() - start)} s",
+                    )
+                raise
 
         return wrapped
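
The rewritten decorators add four things: a configurable log level, argument logging with truncation via the new XINFERENCE_LOG_ARG_MAX_LENGTH constant (see constants.py above), an optional request-id prefix pulled from the request_id kwarg, and error logging with tracebacks on exception. A usage sketch against the new log_async signature as it appears in this diff; fake_launch is a made-up coroutine for illustration:

import asyncio
import logging

from xinference.core.utils import log_async

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@log_async(logger=logger, level=logging.INFO, ignore_kwargs=["api_key"])
async def fake_launch(model_name: str, api_key: str = "", request_id: str = ""):
    await asyncio.sleep(0.1)
    return f"launched {model_name}"


# Logs "[request abc-123] Enter fake_launch, ..." and a matching "Leave" line;
# api_key is filtered out of the logged kwargs, and any argument longer than
# XINFERENCE_LOG_ARG_MAX_LENGTH characters is truncated with a trailing "...".
asyncio.run(fake_launch("qwen2-instruct", api_key="secret", request_id="abc-123"))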
xinference/core/worker.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import logging
 import os
 import platform
 import queue
@@ -73,15 +74,15 @@ class WorkerActor(xo.StatelessActor):
         self._supervisor_ref: Optional[xo.ActorRefType] = None
         self._main_pool = main_pool
         self._main_pool.recover_sub_pool = self.recover_sub_pool
-        self._status_guard_ref: xo.ActorRefType[  # type: ignore
-            "StatusGuardActor"
-        ] = None
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = None  # type: ignore
         self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = None
-        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
-            CacheTrackerActor
-        ] = None
+        self._cache_tracker_ref: xo.ActorRefType[
+            CacheTrackerActor
+        ] = None  # type: ignore
 
         # internal states.
         # temporary placeholder during model launch process:
@@ -185,7 +186,7 @@ class WorkerActor(xo.StatelessActor):
                 break
 
     @classmethod
-    def uid(cls) -> str:
+    def default_uid(cls) -> str:
         return "worker"
 
     async def __post_create__(self):
@@ -270,9 +271,9 @@ class WorkerActor(xo.StatelessActor):
 
         try:
             await self.get_supervisor_ref(add_worker=True)
-        except Exception as e:
+        except Exception:
             # Do not crash the worker if supervisor is down, auto re-connect later
-            logger.error(f"cannot connect to supervisor {e}")
+            logger.error(f"cannot connect to supervisor", exc_info=True)
 
         if not XINFERENCE_DISABLE_HEALTH_CHECK:
             from ..isolation import Isolation
@@ -324,7 +325,7 @@ class WorkerActor(xo.StatelessActor):
         if self._supervisor_ref is not None:
             return self._supervisor_ref
         supervisor_ref = await xo.actor_ref(  # type: ignore
-            address=self._supervisor_address, uid=SupervisorActor.uid()
+            address=self._supervisor_address, uid=SupervisorActor.default_uid()
         )
         # Prevent concurrent operations leads to double initialization, check again.
         if self._supervisor_ref is not None:
@@ -336,13 +337,13 @@ class WorkerActor(xo.StatelessActor):
         logger.info("Connected to supervisor as a fresh worker")
 
         self._status_guard_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=StatusGuardActor.uid()
+            address=self._supervisor_address, uid=StatusGuardActor.default_uid()
         )
         self._event_collector_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=EventCollectorActor.uid()
+            address=self._supervisor_address, uid=EventCollectorActor.default_uid()
         )
         self._cache_tracker_ref = await xo.actor_ref(
-            address=self._supervisor_address, uid=CacheTrackerActor.uid()
+            address=self._supervisor_address, uid=CacheTrackerActor.default_uid()
         )
         # cache_tracker is on supervisor
         from ..model.audio import get_audio_model_descriptions
@@ -770,7 +771,7 @@ class WorkerActor(xo.StatelessActor):
             version_info["model_file_location"],
         )
 
-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def launch_builtin_model(
         self,
         model_uid: str,
@@ -814,7 +815,7 @@ class WorkerActor(xo.StatelessActor):
             )
         except Exception as e:
             # Report callback error can be log and ignore, should not interrupt the Process
-            logger.error("report_event error: %s" % (e))
+            logger.error("report_event error: %s" % (e), exc_info=True)
 
         if gpu_idx is not None:
             logger.info(
@@ -917,7 +918,7 @@ class WorkerActor(xo.StatelessActor):
             {"model_ability": abilities, "status": LaunchStatus.READY.name},
         )
 
-    @log_async(logger=logger)
+    @log_async(logger=logger, level=logging.INFO)
     async def terminate_model(self, model_uid: str, is_model_die=False):
         # Terminate model while its launching is not allow
         if model_uid in self._model_uid_launching_guard:
xinference/deploy/cmdline.py
CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import click
 from xoscar.utils import get_next_port
@@ -38,7 +38,6 @@ from ..constants import (
     XINFERENCE_LOG_MAX_BYTES,
 )
 from ..isolation import Isolation
-from ..types import ChatCompletionMessage
 from .utils import (
     get_config_dict,
     get_log_file,
@@ -1210,13 +1209,12 @@ def model_chat(
     stream: bool,
     api_key: Optional[str],
 ):
-    # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint, api_key=api_key)
     if api_key is None:
         client._set_token(get_stored_token(endpoint, client))
 
-    chat_history: List[ChatCompletionMessage] = []
+    messages: List[Dict] = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -1229,10 +1227,10 @@ def model_chat(
             if prompt == "":
                 break
             print("Assistant: ", end="", file=sys.stdout)
+            messages.append(dict(role="user", content=prompt))
             response_content = ""
             for chunk in model.chat(
-                prompt=prompt,
-                chat_history=chat_history,
+                messages,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
             ):
                 delta = chunk["choices"][0]["delta"]
@@ -1242,10 +1240,7 @@ def model_chat(
                 response_content += delta["content"]
                 print(delta["content"], end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)
-            chat_history.append(ChatCompletionMessage(role="user", content=prompt))
-            chat_history.append(
-                ChatCompletionMessage(role="assistant", content=response_content)
-            )
+            messages.append(dict(role="assistant", content=response_content))
 
         model = client.get_model(model_uid=model_uid)
 
@@ -1274,20 +1269,17 @@ def model_chat(
         prompt = input("User: ")
         if prompt == "":
             break
-        chat_history.append(ChatCompletionMessage(role="user", content=prompt))
+        messages.append({"role": "user", "content": prompt})
         print("Assistant: ", end="", file=sys.stdout)
         response = restful_model.chat(
-            prompt=prompt,
-            chat_history=chat_history,
+            messages,
            generate_config={"stream": stream, "max_tokens": max_tokens},
         )
         if not isinstance(response, dict):
            raise ValueError("chat result is not valid")
         response_content = response["choices"][0]["message"]["content"]
         print(f"{response_content}\n", file=sys.stdout)
-        chat_history.append(
-            ChatCompletionMessage(role="assistant", content=response_content)
-        )
+        messages.append(dict(role="assistant", content=response_content))
 
 
 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
xinference/deploy/local.py
CHANGED
@@ -49,7 +49,7 @@ async def _start_local_cluster(
         address=address, logging_conf=logging_conf
     )
     await xo.create_actor(
-        SupervisorActor, address=address, uid=SupervisorActor.uid()
+        SupervisorActor, address=address, uid=SupervisorActor.default_uid()
     )
     await start_worker_components(
         address=address,
xinference/deploy/supervisor.py
CHANGED
@@ -41,7 +41,7 @@ async def _start_supervisor(address: str, logging_conf: Optional[Dict] = None):
             address=address, n_process=0, logging_conf={"dict": logging_conf}
         )
         await xo.create_actor(
-            SupervisorActor, address=address, uid=SupervisorActor.uid()
+            SupervisorActor, address=address, uid=SupervisorActor.default_uid()
         )
         await pool.join()
     except asyncio.exceptions.CancelledError:
xinference/deploy/utils.py
CHANGED
@@ -167,7 +167,7 @@ def health_check(address: str, max_attempts: int, sleep_interval: int = 3) -> bool:
         from ..core.supervisor import SupervisorActor
 
         supervisor_ref: xo.ActorRefType[SupervisorActor] = await xo.actor_ref(  # type: ignore
-            address=address, uid=SupervisorActor.uid()
+            address=address, uid=SupervisorActor.default_uid()
         )
 
         await supervisor_ref.get_status()
xinference/deploy/worker.py
CHANGED