xinference 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +35 -1
- xinference/client/oscar/actor_client.py +2 -2
- xinference/client/restful/restful_client.py +2 -2
- xinference/conftest.py +5 -1
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +148 -8
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +132 -13
- xinference/deploy/cmdline.py +57 -4
- xinference/deploy/local.py +32 -6
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +70 -3
- xinference/model/llm/llm_family.py +11 -1
- xinference/model/llm/llm_family_modelscope.json +72 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/METADATA +2 -1
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/RECORD +35 -31
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-01-
+ "date": "2024-01-19T17:14:28+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.8.
+ "full-revisionid": "fb3985e95fbb3e6cb51a321d6d6a9a10661128fe",
+ "version": "0.8.1"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -27,6 +27,8 @@ from typing import Any, List, Optional, Union
 import gradio as gr
 import pydantic
 import xoscar as xo
+from aioprometheus import REGISTRY, MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 from fastapi import (
     APIRouter,
     FastAPI,
@@ -252,6 +254,15 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/cluster/auth", self.is_cluster_authenticated, methods=["GET"]
         )
+        # running instances
+        self._router.add_api_route(
+            "/v1/models/instances",
+            self.get_instance_info,
+            methods=["GET"],
+            dependencies=[Security(verify_token, scopes=["models:list"])]
+            if self.is_authenticated()
+            else None,
+        )
         self._router.add_api_route(
             "/v1/models",
             self.list_models,
@@ -380,7 +391,13 @@ class RESTfulAPI:
             else None,
         )
 
+        # Clear the global Registry for the MetricsMiddleware, or
+        # the MetricsMiddleware will register duplicated metrics if the port
+        # conflict (This serve method run more than once).
+        REGISTRY.clear()
+        self._app.add_middleware(MetricsMiddleware)
         self._app.include_router(self._router)
+        self._app.add_route("/metrics", metrics)
 
         # Check all the routes returns Response.
         # This is to avoid `jsonable_encoder` performance issue:
@@ -546,7 +563,9 @@ class RESTfulAPI:
 
         return JSONResponse(content={"model_uid": model_uid})
 
-    async def launch_model(
+    async def launch_model(
+        self, request: Request, wait_ready: bool = Query(True)
+    ) -> JSONResponse:
         payload = await request.json()
         model_uid = payload.get("model_uid")
         model_name = payload.get("model_name")
@@ -591,6 +610,7 @@ class RESTfulAPI:
             replica=replica,
             n_gpu=n_gpu,
             request_limits=request_limits,
+            wait_ready=wait_ready,
             **kwargs,
         )
 
@@ -606,6 +626,20 @@ class RESTfulAPI:
 
         return JSONResponse(content={"model_uid": model_uid})
 
+    async def get_instance_info(
+        self,
+        model_name: Optional[str] = Query(None),
+        model_uid: Optional[str] = Query(None),
+    ) -> JSONResponse:
+        try:
+            infos = await (await self._get_supervisor_ref()).get_instance_info(
+                model_name, model_uid
+            )
+        except Exception as e:
+            logger.error(str(e), exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+        return JSONResponse(content=infos)
+
     async def build_gradio_interface(
         self, model_uid: str, body: BuildGradioInterfaceRequest, request: Request
     ) -> JSONResponse:
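As an aside, the two API changes above can be exercised over plain HTTP. A minimal sketch, assuming a supervisor listening on 127.0.0.1:9997 and an illustrative model name (both are assumptions, not taken from this diff):

import requests

BASE = "http://127.0.0.1:9997"  # assumed local supervisor endpoint

# Launch without blocking until the model is ready (the new wait_ready flag).
resp = requests.post(
    f"{BASE}/v1/models?wait_ready=false",
    json={"model_name": "chatglm3"},  # illustrative payload
)
model_uid = resp.json()["model_uid"]

# Poll the new running-instances endpoint to watch the launch status.
infos = requests.get(
    f"{BASE}/v1/models/instances", params={"model_uid": model_uid}
).json()
for info in infos:
    print(info["model_uid"], info["status"])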
xinference/client/oscar/actor_client.py
CHANGED
@@ -171,7 +171,7 @@ class RerankModelHandle(ModelHandle):
         return results
 
 
-class GenerateModelHandle(
+class GenerateModelHandle(ModelHandle):
     def generate(
         self,
         prompt: str,
@@ -255,7 +255,7 @@ class ChatModelHandle(GenerateModelHandle):
         return ClientIteratorWrapper(r)
 
 
-class ChatglmCppChatModelHandle(
+class ChatglmCppChatModelHandle(ModelHandle):
     def chat(
         self,
         prompt: str,
xinference/client/restful/restful_client.py
CHANGED
@@ -257,7 +257,7 @@ class RESTfulImageModelHandle(RESTfulModelHandle):
         return response_data
 
 
-class RESTfulGenerateModelHandle(
+class RESTfulGenerateModelHandle(RESTfulModelHandle):
     def generate(
         self,
         prompt: str,
@@ -486,7 +486,7 @@ class RESTfulMultimodalModelHandle(RESTfulModelHandle):
         return response_data
 
 
-class RESTfulChatglmCppChatModelHandle(
+class RESTfulChatglmCppChatModelHandle(RESTfulModelHandle):
     def chat(
         self,
         prompt: str,
xinference/conftest.py
CHANGED
@@ -144,7 +144,11 @@ async def _start_test_cluster(
             SupervisorActor, address=address, uid=SupervisorActor.uid()
         )
         await start_worker_components(
-            address=address,
+            address=address,
+            supervisor_address=address,
+            main_pool=pool,
+            metrics_exporter_host=None,
+            metrics_exporter_port=None,
         )
         await pool.join()
     except asyncio.CancelledError:
xinference/core/metrics.py
ADDED
@@ -0,0 +1,83 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+
+import uvicorn
+from aioprometheus import Counter, Gauge
+from aioprometheus.asgi.starlette import metrics
+from fastapi import FastAPI
+from fastapi.responses import RedirectResponse
+
+DEFAULT_METRICS_SERVER_LOG_LEVEL = "warning"
+
+
+generate_throughput = Gauge(
+    "xinference:generate_tokens_per_s", "Generate throughput in tokens/s."
+)
+# Latency
+time_to_first_token = Gauge(
+    "xinference:time_to_first_token_ms", "First token latency in ms."
+)
+# Tokens counter
+input_tokens_total_counter = Counter(
+    "xinference:input_tokens_total_counter", "Total number of input tokens."
+)
+output_tokens_total_counter = Counter(
+    "xinference:output_tokens_total_counter", "Total number of output tokens."
+)
+
+
+def record_metrics(name, op, kwargs):
+    collector = globals().get(name)
+    getattr(collector, op)(**kwargs)
+
+
+def launch_metrics_export_server(q, host=None, port=None):
+    app = FastAPI()
+    app.add_route("/metrics", metrics)
+
+    @app.get("/")
+    async def root():
+        response = RedirectResponse(url="/metrics")
+        return response
+
+    async def main():
+        if host is not None and port is not None:
+            config = uvicorn.Config(
+                app, host=host, port=port, log_level=DEFAULT_METRICS_SERVER_LOG_LEVEL
+            )
+        elif host is not None:
+            config = uvicorn.Config(
+                app, host=host, port=0, log_level=DEFAULT_METRICS_SERVER_LOG_LEVEL
+            )
+        elif port is not None:
+            config = uvicorn.Config(
+                app, port=port, log_level=DEFAULT_METRICS_SERVER_LOG_LEVEL
+            )
+        else:
+            config = uvicorn.Config(app, log_level=DEFAULT_METRICS_SERVER_LOG_LEVEL)
+
+        server = uvicorn.Server(config)
+        task = asyncio.create_task(server.serve())
+
+        while not server.started and not task.done():
+            await asyncio.sleep(0.1)
+
+        for server in server.servers:
+            for socket in server.sockets:
+                q.put(socket.getsockname())
+        await task
+
+    asyncio.run(main())
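Worth noting: record_metrics resolves the collector by name through globals() and dispatches the operation with getattr, so every (name, op, kwargs) triple is just an indirect aioprometheus call. A minimal sketch of that equivalence (the label values are illustrative):

from xinference.core.metrics import output_tokens_total_counter, record_metrics

labels = {"model": "demo-model", "node": "127.0.0.1:1234"}  # illustrative labels

# This indirect call...
record_metrics("output_tokens_total_counter", "add", {"labels": labels, "value": 42})
# ...resolves to the direct collector operation:
output_tokens_total_counter.add(labels, 42)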
xinference/core/model.py
CHANGED
@@ -17,6 +17,7 @@ import functools
 import inspect
 import json
 import os
+import time
 import types
 import weakref
 from typing import (
@@ -34,7 +35,9 @@ import sse_starlette.sse
 import xoscar as xo
 
 if TYPE_CHECKING:
+    from .worker import WorkerActor
     from ..model.llm.core import LLM
+    from ..model.core import ModelDescription
     import PIL
 
 import logging
@@ -140,13 +143,23 @@ class ModelActor(xo.StatelessActor):
             gc.collect()
             torch.cuda.empty_cache()
 
-    def __init__(
+    def __init__(
+        self,
+        worker_address: str,
+        model: "LLM",
+        model_description: Optional["ModelDescription"] = None,
+        request_limits: Optional[int] = None,
+    ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel
 
+        self._worker_address = worker_address
         self._model = model
+        self._model_description = (
+            model_description.to_dict() if model_description else {}
+        )
         self._request_limits = request_limits
 
         self._generators: Dict[str, Union[Iterator, AsyncGenerator]] = {}
@@ -156,7 +169,65 @@ class ModelActor(xo.StatelessActor):
             if isinstance(self._model, (PytorchModel, SpeculativeModel, VLLMModel))
             else asyncio.locks.Lock()
         )
+        self._worker_ref = None
         self._serve_count = 0
+        self._metrics_labels = {
+            "type": self._model_description.get("model_type", "unknown"),
+            "model": self.model_uid(),
+            "node": self._worker_address,
+            "format": self._model_description.get("model_format", "unknown"),
+            "quantization": self._model_description.get("quantization", "none"),
+        }
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    async def __post_create__(self):
+        self._loop = asyncio.get_running_loop()
+
+    async def _record_completion_metrics(
+        self, duration, completion_tokens, prompt_tokens
+    ):
+        coros = []
+        if completion_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "output_tokens_total_counter",
+                    "add",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": completion_tokens,
+                    },
+                )
+            )
+        if prompt_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "input_tokens_total_counter",
+                    "add",
+                    {"labels": self._metrics_labels, "value": prompt_tokens},
+                )
+            )
+        if completion_tokens > 0:
+            generate_throughput = completion_tokens / duration
+            coros.append(
+                self.record_metrics(
+                    "generate_throughput",
+                    "set",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": generate_throughput,
+                    },
+                )
+            )
+        await asyncio.gather(*coros)
+
+    async def _get_worker_ref(self) -> xo.ActorRefType["WorkerActor"]:
+        from .worker import WorkerActor
+
+        if self._worker_ref is None:
+            self._worker_ref = await xo.actor_ref(
+                address=self._worker_address, uid=WorkerActor.uid()
+            )
+        return self._worker_ref
 
     def is_vllm_backend(self) -> bool:
         from ..model.llm.vllm.core import VLLMModel
@@ -178,8 +249,14 @@ class ModelActor(xo.StatelessActor):
         )
 
     def _to_json_generator(self, gen: types.GeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
             for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
                 v = dict(data=json.dumps(v))
                 yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:
@@ -187,10 +264,31 @@
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            if self._loop is not None and time_to_first_token is not None:
+                coro = self.record_metrics(
+                    "time_to_first_token",
+                    "set",
+                    {"labels": self._metrics_labels, "value": time_to_first_token},
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)
+            if self._loop is not None and final_usage is not None:
+                coro = self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens=final_usage["completion_tokens"],
+                    prompt_tokens=final_usage["prompt_tokens"],
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)
 
     async def _to_json_async_gen(self, gen: types.AsyncGeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
             async for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
                 v = await asyncio.to_thread(json.dumps, v)
                 v = dict(data=v)  # noqa: F821
                 yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
@@ -199,6 +297,25 @@
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            coros = []
+            if time_to_first_token is not None:
+                coros.append(
+                    self.record_metrics(
+                        "time_to_first_token",
+                        "set",
+                        {"labels": self._metrics_labels, "value": time_to_first_token},
+                    )
+                )
+            if final_usage is not None:
+                coros.append(
+                    self._record_completion_metrics(
+                        time.time() - start_time,
+                        completion_tokens=final_usage["completion_tokens"],
+                        prompt_tokens=final_usage["prompt_tokens"],
+                    )
+                )
+            await asyncio.gather(*coros)
 
     @oom_check
     async def _call_wrapper(self, fn: Callable, *args, **kwargs):
@@ -245,13 +362,32 @@
     @request_limit
     @xo.generator
     async def chat(self, prompt: str, *args, **kwargs):
-
-
-
-
-
-
-
+        start_time = time.time()
+        response = None
+        try:
+            if hasattr(self._model, "chat"):
+                response = await self._call_wrapper(
+                    self._model.chat, prompt, *args, **kwargs
+                )
+                return response
+            if hasattr(self._model, "async_chat"):
+                response = await self._call_wrapper(
+                    self._model.async_chat, prompt, *args, **kwargs
+                )
+                return response
+            raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
+        finally:
+            # For the non stream result.
+            if response is not None and isinstance(response, dict):
+                usage = response["usage"]
+                # Some backends may not have a valid usage, we just skip them.
+                completion_tokens = usage["completion_tokens"]
+                prompt_tokens = usage["prompt_tokens"]
+                await self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens,
+                    prompt_tokens,
+                )
 
     @log_async(logger=logger)
     @request_limit
@@ -341,3 +477,7 @@
             raise AttributeError(
                 f"Model {self._model.model_spec} is not for creating image."
            )
+
+    async def record_metrics(self, name, op, kwargs):
+        worker_ref = await self._get_worker_ref()
+        await worker_ref.record_metrics(name, op, kwargs)
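The generator instrumentation above reduces to three measurements: time to first token in milliseconds, token counters taken from the final usage dict, and throughput as completion tokens over wall-clock duration. A standalone sketch of the same bookkeeping over any stream of chunk dicts (synthetic, not the actor code itself):

import time

def instrumented(gen):
    # Mirror _to_json_generator's bookkeeping on a stream of chunk dicts.
    start = time.time()
    ttft_ms = None
    final_usage = None
    for chunk in gen:
        if ttft_ms is None:
            ttft_ms = (time.time() - start) * 1000  # first-token latency
        final_usage = chunk.pop("usage", None)  # last chunk's usage wins
        yield chunk
    duration = time.time() - start
    if final_usage is not None:
        throughput = final_usage["completion_tokens"] / duration  # tokens/s
        print(f"ttft={ttft_ms:.1f}ms, throughput={throughput:.1f} tok/s")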
xinference/core/status_guard.py
ADDED
@@ -0,0 +1,86 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from logging import getLogger
+from typing import Dict, List, Optional
+
+import xoscar as xo
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+
+class LaunchStatus(Enum):
+    CREATING = 1
+    UPDATING = 2
+    TERMINATING = 3
+    TERMINATED = 4
+    READY = 5
+    ERROR = 6
+
+
+class InstanceInfo(BaseModel):
+    model_name: str
+    model_uid: str
+    model_ability: List[str]
+    replica: int
+    status: str
+    instance_created_ts: int
+
+    def update(self, **kwargs):
+        for field, value in kwargs.items():
+            setattr(self, field, value)
+
+
+class StatusGuardActor(xo.StatelessActor):
+    def __init__(self):
+        super().__init__()
+        self._model_uid_to_info: Dict[str, InstanceInfo] = {}
+
+    @classmethod
+    def uid(cls) -> str:
+        return "status_guard"
+
+    @staticmethod
+    def _drop_terminated_info(instance_infos: List[InstanceInfo]) -> List[InstanceInfo]:
+        return [
+            info
+            for info in instance_infos
+            if info.status != LaunchStatus.TERMINATED.name
+        ]
+
+    def set_instance_info(self, model_uid: str, info: InstanceInfo):
+        self._model_uid_to_info[model_uid] = info
+
+    def get_instance_info(
+        self, model_name: Optional[str] = None, model_uid: Optional[str] = None
+    ) -> List[InstanceInfo]:
+        if model_uid is not None:
+            return (
+                self._drop_terminated_info([self._model_uid_to_info[model_uid]])
+                if model_uid in self._model_uid_to_info
+                else []
+            )
+        all_infos: List[InstanceInfo] = list(self._model_uid_to_info.values())
+        filtered_infos: List[InstanceInfo] = list(
+            filter(lambda info: info.model_name == model_name, all_infos)
+        )
+        return (
+            self._drop_terminated_info(filtered_infos)
+            if model_name is not None
+            else self._drop_terminated_info(all_infos)
+        )
+
+    def update_instance_info(self, model_uid: str, info: Dict):
+        self._model_uid_to_info[model_uid].update(**info)
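The guard never deletes entries: terminated instances stay in the map but are filtered out of every read via _drop_terminated_info. A small sketch of that visibility rule using the pydantic model directly (the uids are made up):

from xinference.core.status_guard import InstanceInfo, LaunchStatus

live = InstanceInfo(
    model_name="demo", model_uid="demo-1", model_ability=[],
    replica=1, status=LaunchStatus.READY.name, instance_created_ts=0,
)
dead = live.copy(update={"model_uid": "demo-2",
                         "status": LaunchStatus.TERMINATED.name})

# get_instance_info applies this filter on every read:
visible = [i for i in (live, dead) if i.status != LaunchStatus.TERMINATED.name]
assert [i.model_uid for i in visible] == ["demo-1"]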
xinference/core/supervisor.py
CHANGED
@@ -22,6 +22,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Un
 import xoscar as xo
 
 from ..core import ModelActor
+from ..core.status_guard import InstanceInfo, LaunchStatus
+from .metrics import record_metrics
 from .resource import ResourceStatus
 from .utils import (
     build_replica_model_uid,
@@ -46,6 +48,12 @@ logger = getLogger(__name__)
 
 
 DEFAULT_NODE_TIMEOUT = 60
+ASYNC_LAUNCH_TASKS = {}  # type: ignore
+
+
+def callback_for_async_launch(model_uid: str):
+    ASYNC_LAUNCH_TASKS.pop(model_uid, None)
+    logger.debug(f"Model uid: {model_uid} async launch completes.")
 
 
 @dataclass
@@ -81,6 +89,13 @@ class SupervisorActor(xo.StatelessActor):
         # comment this line to avoid worker lost
         # self._check_dead_nodes_task = asyncio.create_task(self._check_dead_nodes())
         logger.info(f"Xinference supervisor {self.address} started")
+        from .status_guard import StatusGuardActor
+
+        self._status_guard_ref: xo.ActorRefType[
+            "StatusGuardActor"
+        ] = await xo.create_actor(
+            StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
+        )
 
         from ..model.embedding import (
             CustomEmbeddingModelSpec,
@@ -119,11 +134,13 @@ class SupervisorActor(xo.StatelessActor):
         from ..model.llm.llm_family import (
             BUILTIN_LLM_MODEL_CHAT_FAMILIES,
             BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
         )
 
         return {
             "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
             "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
+            "tool_call": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
         }
 
     async def get_devices_count(self) -> int:
@@ -511,6 +528,7 @@ class SupervisorActor(xo.StatelessActor):
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
+        wait_ready: bool = True,
         **kwargs,
     ) -> str:
         if model_uid is None:
@@ -552,6 +570,18 @@ class SupervisorActor(xo.StatelessActor):
             )
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
 
+        async def _launch_model():
+            try:
+                for rep_model_uid in iter_replica_model_uid(model_uid, replica):
+                    await _launch_one_model(rep_model_uid)
+            except Exception:
+                # terminate_model will remove the replica info.
+                await self.terminate_model(model_uid, suppress_exception=True)
+                await self._status_guard_ref.update_instance_info(
+                    model_uid, {"status": LaunchStatus.ERROR.name}
+                )
+                raise
+
         if not is_valid_model_uid(model_uid):
             raise ValueError(
                 "The model UID is invalid. Please specify the model UID by 0 < length <= 100."
@@ -568,15 +598,31 @@ class SupervisorActor(xo.StatelessActor):
         self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
             replica=replica, scheduler=itertools.cycle(range(replica))
         )
-
-
-
-
-
-
-
+        instance_info = InstanceInfo(
+            model_name=model_name,
+            model_uid=model_uid,
+            model_ability=[],
+            replica=replica,
+            status=LaunchStatus.CREATING.name,
+            instance_created_ts=int(time.time()),
+        )
+        await self._status_guard_ref.set_instance_info(model_uid, instance_info)
+        if wait_ready:
+            await _launch_model()
+        else:
+            task = asyncio.create_task(_launch_model())
+            ASYNC_LAUNCH_TASKS[model_uid] = task
+            task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
         return model_uid
 
+    async def get_instance_info(
+        self, model_name: Optional[str], model_uid: Optional[str]
+    ) -> List[Dict]:
+        infos = await self._status_guard_ref.get_instance_info(
+            model_name=model_name, model_uid=model_uid
+        )
+        return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
+
     async def _check_dead_nodes(self):
         while True:
             dead_nodes = []
@@ -705,3 +751,7 @@ class SupervisorActor(xo.StatelessActor):
         self._worker_status[worker_address] = WorkerStatus(
             update_time=time.time(), status=status
         )
+
+    @staticmethod
+    def record_metrics(name, op, kwargs):
+        record_metrics(name, op, kwargs)
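The wait_ready=False path above is a plain fire-and-forget task pattern: the launch coroutine is scheduled, tracked in ASYNC_LAUNCH_TASKS, and removed by the done-callback. A self-contained sketch of that pattern (the launch body is a stand-in, not the real replica loop):

import asyncio

ASYNC_LAUNCH_TASKS = {}

def callback_for_async_launch(model_uid: str):
    ASYNC_LAUNCH_TASKS.pop(model_uid, None)
    print(f"Model uid: {model_uid} async launch completes.")

async def _launch_model(model_uid: str):
    await asyncio.sleep(0.1)  # stand-in for launching each replica

async def launch(model_uid: str, wait_ready: bool) -> str:
    if wait_ready:
        await _launch_model(model_uid)  # block until the model is ready
    else:
        task = asyncio.create_task(_launch_model(model_uid))
        ASYNC_LAUNCH_TASKS[model_uid] = task  # caller returns immediately
        task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
    return model_uid

async def main():
    uid = await launch("demo", wait_ready=False)  # returns right away
    await asyncio.gather(*ASYNC_LAUNCH_TASKS.values())  # launch finishes later

asyncio.run(main())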