PyPI - xinference - Versions diffs - 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend - Supply Chain Defender

xinference 0.7.5 (py3-none-any.whl) → 0.8.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic. Click here for more details.

Files changed (120)

xinference/core/supervisor.py CHANGED Viewed

@@ -22,6 +22,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Un
 import xoscar as xo
 from ..core import ModelActor
+from ..core.status_guard import InstanceInfo, LaunchStatus
+from .metrics import record_metrics
 from .resource import ResourceStatus
 from .utils import (
     build_replica_model_uid,
@@ -46,6 +48,12 @@ logger = getLogger(__name__)
 DEFAULT_NODE_TIMEOUT = 60
+ASYNC_LAUNCH_TASKS = {}  # type: ignore
+def callback_for_async_launch(model_uid: str):
+    ASYNC_LAUNCH_TASKS.pop(model_uid, None)
+    logger.debug(f"Model uid: {model_uid} async launch completes.")
 @dataclass
@@ -81,6 +89,13 @@ class SupervisorActor(xo.StatelessActor):
         # comment this line to avoid worker lost
         # self._check_dead_nodes_task = asyncio.create_task(self._check_dead_nodes())
         logger.info(f"Xinference supervisor {self.address} started")
+        from .status_guard import StatusGuardActor
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = await xo.create_actor(
+            StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
+        )
         from ..model.embedding import (
             CustomEmbeddingModelSpec,
@@ -119,11 +134,13 @@ class SupervisorActor(xo.StatelessActor):
         from ..model.llm.llm_family import (
             BUILTIN_LLM_MODEL_CHAT_FAMILIES,
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
         )
         return {
             "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
             "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
+            "tool_call": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
         }
     async def get_devices_count(self) -> int:
@@ -511,6 +528,7 @@ class SupervisorActor(xo.StatelessActor):
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
+        wait_ready: bool = True,
         **kwargs,
     ) -> str:
         if model_uid is None:
@@ -552,6 +570,18 @@ class SupervisorActor(xo.StatelessActor):
             )
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
+        async def _launch_model():
+            try:
+                for rep_model_uid in iter_replica_model_uid(model_uid, replica):
+                    await _launch_one_model(rep_model_uid)
+            except Exception:
+                # terminate_model will remove the replica info.
+                await self.terminate_model(model_uid, suppress_exception=True)
+                await self._status_guard_ref.update_instance_info(
+                    model_uid, {"status": LaunchStatus.ERROR.name}
+                )
+                raise
         if not is_valid_model_uid(model_uid):
             raise ValueError(
                 "The model UID is invalid. Please specify the model UID by 0 < length <= 100."
@@ -568,15 +598,31 @@ class SupervisorActor(xo.StatelessActor):
         self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
             replica=replica, scheduler=itertools.cycle(range(replica))
         )
-        try:
-            for rep_model_uid in iter_replica_model_uid(model_uid, replica):
-                await _launch_one_model(rep_model_uid)
-        except Exception:
-            # terminate_model will remove the replica info.
-            await self.terminate_model(model_uid, suppress_exception=True)
-            raise
+        instance_info = InstanceInfo(
+            model_name=model_name,
+            model_uid=model_uid,
+            model_ability=[],
+            replica=replica,
+            status=LaunchStatus.CREATING.name,
+            instance_created_ts=int(time.time()),
+        )
+        await self._status_guard_ref.set_instance_info(model_uid, instance_info)
+        if wait_ready:
+            await _launch_model()
+        else:
+            task = asyncio.create_task(_launch_model())
+            ASYNC_LAUNCH_TASKS[model_uid] = task
+            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
         return model_uid
+    async def get_instance_info(
+        self, model_name: Optional[str], model_uid: Optional[str]
+    ) -> List[Dict]:
+        infos = await self._status_guard_ref.get_instance_info(
+            model_name=model_name, model_uid=model_uid
+        )
+        return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
     async def _check_dead_nodes(self):
         while True:
             dead_nodes = []
@@ -705,3 +751,7 @@ class SupervisorActor(xo.StatelessActor):
         self._worker_status[worker_address] = WorkerStatus(
             update_time=time.time(), status=status
         )
+    @staticmethod
+    def record_metrics(name, op, kwargs):
+        record_metrics(name, op, kwargs)

xinference/core/worker.py CHANGED Viewed

@@ -15,7 +15,9 @@
 import asyncio
 import os
 import platform
+import queue
 import signal
+import threading
 from collections import defaultdict
 from logging import getLogger
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
@@ -25,8 +27,10 @@ from xoscar import MainActorPoolType
 from ..constants import XINFERENCE_CACHE_DIR
 from ..core import ModelActor
+from ..core.status_guard import LaunchStatus
 from ..model.core import ModelDescription, create_model_instance
 from ..utils import cuda_count
+from .metrics import launch_metrics_export_server, record_metrics
 from .resource import gather_node_info
 from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
@@ -34,6 +38,12 @@ logger = getLogger(__name__)
 DEFAULT_NODE_HEARTBEAT_INTERVAL = 5
+MODEL_ACTOR_AUTO_RECOVER_LIMIT: Optional[int]
+_MODEL_ACTOR_AUTO_RECOVER_LIMIT = os.getenv("XINFERENCE_MODEL_ACTOR_AUTO_RECOVER_LIMIT")
+if _MODEL_ACTOR_AUTO_RECOVER_LIMIT is not None:
+    MODEL_ACTOR_AUTO_RECOVER_LIMIT = int(_MODEL_ACTOR_AUTO_RECOVER_LIMIT)
+else:
+    MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
 class WorkerActor(xo.StatelessActor):
@@ -42,6 +52,8 @@ class WorkerActor(xo.StatelessActor):
         supervisor_address: str,
         main_pool: MainActorPoolType,
         cuda_devices: List[int],
+        metrics_exporter_host: Optional[str] = None,
+        metrics_exporter_port: Optional[int] = None,
     ):
         super().__init__()
         # static attrs.
@@ -57,20 +69,71 @@ class WorkerActor(xo.StatelessActor):
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
+        self._model_uid_to_recover_count: Dict[str, int] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
+        # metrics export server.
+        if metrics_exporter_host is not None or metrics_exporter_port is not None:
+            logger.info(
+                f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
+            )
+            q: queue.Queue = queue.Queue()
+            self._metrics_thread = threading.Thread(
+                name="Metrics Export Server",
+                target=launch_metrics_export_server,
+                args=(q, metrics_exporter_host, metrics_exporter_port),
+                daemon=True,
+            )
+            self._metrics_thread.start()
+            logger.info("Checking metrics export server...")
+            while self._metrics_thread.is_alive():
+                try:
+                    host, port = q.get(block=False)[:2]
+                    logger.info(f"Metrics server is started at: http://{host}:{port}")
+                    break
+                except queue.Empty:
+                    pass
+            else:
+                raise Exception("Metrics server thread exit.")
         self._lock = asyncio.Lock()
     async def recover_sub_pool(self, address):
-        logger.warning("Process %s is down, create model.", address)
+        logger.warning("Process %s is down.", address)
+        # Xoscar does not remove the address from sub_processes.
+        try:
+            await self._main_pool.remove_sub_pool(address)
+        except Exception:
+            pass
         for model_uid, addr in self._model_uid_to_addr.items():
             if addr == address:
                 launch_args = self._model_uid_to_launch_args.get(model_uid)
-                try:
-                    await self.terminate_model(model_uid)
-                except Exception:
-                    pass
-                await self.launch_builtin_model(**launch_args)
+                if launch_args is None:
+                    logger.warning(
+                        "Not recreate model because the it is down during launch."
+                    )
+                else:
+                    recover_count = self._model_uid_to_recover_count.get(model_uid)
+                    try:
+                        await self.terminate_model(model_uid)
+                    except Exception:
+                        pass
+                    if recover_count is not None:
+                        if recover_count > 0:
+                            logger.warning(
+                                "Recreating model actor %s, remain %s times ...",
+                                model_uid,
+                                recover_count - 1,
+                            )
+                            self._model_uid_to_recover_count[model_uid] = (
+                                recover_count - 1
+                            )
+                            await self.launch_builtin_model(**launch_args)
+                        else:
+                            logger.warning("Stop recreating model actor.")
+                    else:
+                        logger.warning("Recreating model actor %s ...", model_uid)
+                        await self.launch_builtin_model(**launch_args)
                 break
     @classmethod
@@ -78,8 +141,14 @@ class WorkerActor(xo.StatelessActor):
         return "worker"
     async def __post_create__(self):
+        from .status_guard import StatusGuardActor
         from .supervisor import SupervisorActor
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = await xo.actor_ref(
+            address=self._supervisor_address, uid=StatusGuardActor.uid()
+        )
         self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
@@ -309,7 +378,12 @@ class WorkerActor(xo.StatelessActor):
         try:
             model_ref = await xo.create_actor(
-                ModelActor, address=subpool_address, uid=model_uid, model=model
+                ModelActor,
+                address=subpool_address,
+                uid=model_uid,
+                worker_address=self.address,
+                model=model,
+                model_description=model_description,
             )
             await model_ref.load()
         except:
@@ -324,6 +398,22 @@ class WorkerActor(xo.StatelessActor):
             self._gpu_to_model_uid[int(dev)] = model_uid
         self._model_uid_to_addr[model_uid] = subpool_address
+    async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
+        from ..model.llm.core import LLM
+        if model_type == "embedding":
+            return ["embed"]
+        elif model_type == "rerank":
+            return ["rerank"]
+        elif model_type == "image":
+            return ["text_to_image"]
+        elif model_type == "multimodal":
+            return ["multimodal"]
+        else:
+            assert model_type == "LLM"
+            assert isinstance(model, LLM)
+            return model.model_family.model_ability  # type: ignore
     @log_async(logger=logger)
     async def launch_builtin_model(
         self,
@@ -339,6 +429,8 @@ class WorkerActor(xo.StatelessActor):
     ):
         launch_args = locals()
         launch_args.pop("self")
+        launch_args.pop("kwargs")
+        launch_args.update(kwargs)
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > cuda_count()):
                 raise ValueError(
@@ -358,6 +450,7 @@ class WorkerActor(xo.StatelessActor):
         )
         try:
+            origin_uid, _, _ = parse_replica_model_uid(model_uid)
             model, model_description = await asyncio.to_thread(
                 create_model_instance,
                 subpool_address,
@@ -375,7 +468,9 @@ class WorkerActor(xo.StatelessActor):
                 ModelActor,
                 address=subpool_address,
                 uid=model_uid,
+                worker_address=self.address,
                 model=model,
+                model_description=model_description,
                 request_limits=request_limits,
             )
             await model_ref.load()
@@ -388,13 +483,27 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model[model_uid] = model_ref
         self._model_uid_to_model_spec[model_uid] = model_description
         self._model_uid_to_addr[model_uid] = subpool_address
+        self._model_uid_to_recover_count.setdefault(
+            model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
+        )
         self._model_uid_to_launch_args[model_uid] = launch_args
+        # update status to READY
+        abilities = await self._get_model_ability(model, model_type)
+        await self._status_guard_ref.update_instance_info(
+            origin_uid,
+            {"model_ability": abilities, "status": LaunchStatus.READY.name},
+        )
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
+        origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        await self._status_guard_ref.update_instance_info(
+            origin_uid, {"status": LaunchStatus.TERMINATING.name}
+        )
         model_ref = self._model_uid_to_model.get(model_uid, None)
         if model_ref is None:
-            raise ValueError(f"Model not found in the model list, uid: {model_uid}")
+            logger.debug("Model not found, uid: %s", model_uid)
         try:
             await xo.destroy_actor(model_ref)
@@ -405,12 +514,20 @@ class WorkerActor(xo.StatelessActor):
         try:
             subpool_address = self._model_uid_to_addr[model_uid]
             await self._main_pool.remove_sub_pool(subpool_address)
+        except Exception as e:
+            logger.debug(
+                "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
+            )
         finally:
-            del self._model_uid_to_model[model_uid]
-            del self._model_uid_to_model_spec[model_uid]
+            self._model_uid_to_model.pop(model_uid, None)
+            self._model_uid_to_model_spec.pop(model_uid, None)
             self.release_devices(model_uid)
-            del self._model_uid_to_addr[model_uid]
-            del self._model_uid_to_launch_args[model_uid]
+            self._model_uid_to_addr.pop(model_uid, None)
+            self._model_uid_to_recover_count.pop(model_uid, None)
+            self._model_uid_to_launch_args.pop(model_uid, None)
+            await self._status_guard_ref.update_instance_info(
+                origin_uid, {"status": LaunchStatus.TERMINATED.name}
+            )
     @log_async(logger=logger)
     async def list_models(self) -> Dict[str, Dict[str, Any]]:
@@ -425,7 +542,7 @@ class WorkerActor(xo.StatelessActor):
     def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
         model_ref = self._model_uid_to_model.get(model_uid, None)
         if model_ref is None:
-            raise ValueError(f"Model not found in the model list, uid: {model_uid}")
+            raise ValueError(f"Model not found, uid: {model_uid}")
         return model_ref
     @log_sync(logger=logger)
@@ -458,3 +575,7 @@ class WorkerActor(xo.StatelessActor):
                 await asyncio.sleep(DEFAULT_NODE_HEARTBEAT_INTERVAL)
             except asyncio.CancelledError:  # pragma: no cover
                 break
+    @staticmethod
+    def record_metrics(name, op, kwargs):
+        record_metrics(name, op, kwargs)