xinference-0.7.5-py3-none-any.whl → xinference-0.8.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/oauth2/__init__.py +13 -0
- xinference/api/oauth2/common.py +14 -0
- xinference/api/oauth2/core.py +93 -0
- xinference/api/oauth2/types.py +36 -0
- xinference/api/oauth2/utils.py +44 -0
- xinference/api/restful_api.py +216 -27
- xinference/client/oscar/actor_client.py +18 -18
- xinference/client/restful/restful_client.py +96 -33
- xinference/conftest.py +63 -1
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +143 -3
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +244 -181
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +134 -13
- xinference/deploy/cmdline.py +142 -16
- xinference/deploy/local.py +39 -7
- xinference/deploy/supervisor.py +2 -0
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/core.py +8 -1
- xinference/model/embedding/core.py +3 -2
- xinference/model/embedding/model_spec_modelscope.json +60 -18
- xinference/model/image/stable_diffusion/core.py +4 -3
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +87 -3
- xinference/model/llm/llm_family.py +15 -5
- xinference/model/llm/llm_family_modelscope.json +92 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/core.py +8 -1
- xinference/model/multimodal/model_spec.json +9 -0
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/multimodal/qwen_vl.py +5 -9
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
- xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +36 -0
- xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
- xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
- xinference/web/ui/node_modules/react-cookie/package.json +55 -0
- xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
- xinference/web/ui/package-lock.json +37 -0
- xinference/web/ui/package.json +3 -2
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
- xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/core/supervisor.py
CHANGED
|
@@ -22,6 +22,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Un
|
|
|
22
22
|
import xoscar as xo
|
|
23
23
|
|
|
24
24
|
from ..core import ModelActor
|
|
25
|
+
from ..core.status_guard import InstanceInfo, LaunchStatus
|
|
26
|
+
from .metrics import record_metrics
|
|
25
27
|
from .resource import ResourceStatus
|
|
26
28
|
from .utils import (
|
|
27
29
|
build_replica_model_uid,
|
|
@@ -46,6 +48,12 @@ logger = getLogger(__name__)
|
|
|
46
48
|
|
|
47
49
|
|
|
48
50
|
DEFAULT_NODE_TIMEOUT = 60
|
|
51
|
+
ASYNC_LAUNCH_TASKS = {} # type: ignore
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def callback_for_async_launch(model_uid: str):
|
|
55
|
+
ASYNC_LAUNCH_TASKS.pop(model_uid, None)
|
|
56
|
+
logger.debug(f"Model uid: {model_uid} async launch completes.")
|
|
49
57
|
|
|
50
58
|
|
|
51
59
|
@dataclass
|
|
@@ -81,6 +89,13 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
81
89
|
# comment this line to avoid worker lost
|
|
82
90
|
# self._check_dead_nodes_task = asyncio.create_task(self._check_dead_nodes())
|
|
83
91
|
logger.info(f"Xinference supervisor {self.address} started")
|
|
92
|
+
from .status_guard import StatusGuardActor
|
|
93
|
+
|
|
94
|
+
self._status_guard_ref: xo.ActorRefType[
|
|
95
|
+
"StatusGuardActor"
|
|
96
|
+
] = await xo.create_actor(
|
|
97
|
+
StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
|
|
98
|
+
)
|
|
84
99
|
|
|
85
100
|
from ..model.embedding import (
|
|
86
101
|
CustomEmbeddingModelSpec,
|
|
@@ -119,11 +134,13 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
119
134
|
from ..model.llm.llm_family import (
|
|
120
135
|
BUILTIN_LLM_MODEL_CHAT_FAMILIES,
|
|
121
136
|
BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
|
|
137
|
+
BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
|
|
122
138
|
)
|
|
123
139
|
|
|
124
140
|
return {
|
|
125
141
|
"chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
|
|
126
142
|
"generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
|
|
143
|
+
"tool_call": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
|
|
127
144
|
}
|
|
128
145
|
|
|
129
146
|
async def get_devices_count(self) -> int:
|
|
@@ -511,6 +528,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
511
528
|
replica: int = 1,
|
|
512
529
|
n_gpu: Optional[Union[int, str]] = "auto",
|
|
513
530
|
request_limits: Optional[int] = None,
|
|
531
|
+
wait_ready: bool = True,
|
|
514
532
|
**kwargs,
|
|
515
533
|
) -> str:
|
|
516
534
|
if model_uid is None:
|
|
@@ -552,6 +570,18 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
552
570
|
)
|
|
553
571
|
self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
|
|
554
572
|
|
|
573
|
+
async def _launch_model():
|
|
574
|
+
try:
|
|
575
|
+
for rep_model_uid in iter_replica_model_uid(model_uid, replica):
|
|
576
|
+
await _launch_one_model(rep_model_uid)
|
|
577
|
+
except Exception:
|
|
578
|
+
# terminate_model will remove the replica info.
|
|
579
|
+
await self.terminate_model(model_uid, suppress_exception=True)
|
|
580
|
+
await self._status_guard_ref.update_instance_info(
|
|
581
|
+
model_uid, {"status": LaunchStatus.ERROR.name}
|
|
582
|
+
)
|
|
583
|
+
raise
|
|
584
|
+
|
|
555
585
|
if not is_valid_model_uid(model_uid):
|
|
556
586
|
raise ValueError(
|
|
557
587
|
"The model UID is invalid. Please specify the model UID by 0 < length <= 100."
|
|
@@ -568,15 +598,31 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
568
598
|
self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
|
|
569
599
|
replica=replica, scheduler=itertools.cycle(range(replica))
|
|
570
600
|
)
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
601
|
+
instance_info = InstanceInfo(
|
|
602
|
+
model_name=model_name,
|
|
603
|
+
model_uid=model_uid,
|
|
604
|
+
model_ability=[],
|
|
605
|
+
replica=replica,
|
|
606
|
+
status=LaunchStatus.CREATING.name,
|
|
607
|
+
instance_created_ts=int(time.time()),
|
|
608
|
+
)
|
|
609
|
+
await self._status_guard_ref.set_instance_info(model_uid, instance_info)
|
|
610
|
+
if wait_ready:
|
|
611
|
+
await _launch_model()
|
|
612
|
+
else:
|
|
613
|
+
task = asyncio.create_task(_launch_model())
|
|
614
|
+
ASYNC_LAUNCH_TASKS[model_uid] = task
|
|
615
|
+
task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
|
|
578
616
|
return model_uid
|
|
579
617
|
|
|
618
|
+
async def get_instance_info(
|
|
619
|
+
self, model_name: Optional[str], model_uid: Optional[str]
|
|
620
|
+
) -> List[Dict]:
|
|
621
|
+
infos = await self._status_guard_ref.get_instance_info(
|
|
622
|
+
model_name=model_name, model_uid=model_uid
|
|
623
|
+
)
|
|
624
|
+
return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
|
|
625
|
+
|
|
580
626
|
async def _check_dead_nodes(self):
|
|
581
627
|
while True:
|
|
582
628
|
dead_nodes = []
|
|
@@ -705,3 +751,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
705
751
|
self._worker_status[worker_address] = WorkerStatus(
|
|
706
752
|
update_time=time.time(), status=status
|
|
707
753
|
)
|
|
754
|
+
|
|
755
|
+
@staticmethod
|
|
756
|
+
def record_metrics(name, op, kwargs):
|
|
757
|
+
record_metrics(name, op, kwargs)
|
xinference/core/worker.py
CHANGED
|
@@ -15,7 +15,9 @@
|
|
|
15
15
|
import asyncio
|
|
16
16
|
import os
|
|
17
17
|
import platform
|
|
18
|
+
import queue
|
|
18
19
|
import signal
|
|
20
|
+
import threading
|
|
19
21
|
from collections import defaultdict
|
|
20
22
|
from logging import getLogger
|
|
21
23
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -25,8 +27,10 @@ from xoscar import MainActorPoolType
|
|
|
25
27
|
|
|
26
28
|
from ..constants import XINFERENCE_CACHE_DIR
|
|
27
29
|
from ..core import ModelActor
|
|
30
|
+
from ..core.status_guard import LaunchStatus
|
|
28
31
|
from ..model.core import ModelDescription, create_model_instance
|
|
29
32
|
from ..utils import cuda_count
|
|
33
|
+
from .metrics import launch_metrics_export_server, record_metrics
|
|
30
34
|
from .resource import gather_node_info
|
|
31
35
|
from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
|
|
32
36
|
|
|
@@ -34,6 +38,12 @@ logger = getLogger(__name__)
|
|
|
34
38
|
|
|
35
39
|
|
|
36
40
|
DEFAULT_NODE_HEARTBEAT_INTERVAL = 5
|
|
41
|
+
MODEL_ACTOR_AUTO_RECOVER_LIMIT: Optional[int]
|
|
42
|
+
_MODEL_ACTOR_AUTO_RECOVER_LIMIT = os.getenv("XINFERENCE_MODEL_ACTOR_AUTO_RECOVER_LIMIT")
|
|
43
|
+
if _MODEL_ACTOR_AUTO_RECOVER_LIMIT is not None:
|
|
44
|
+
MODEL_ACTOR_AUTO_RECOVER_LIMIT = int(_MODEL_ACTOR_AUTO_RECOVER_LIMIT)
|
|
45
|
+
else:
|
|
46
|
+
MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
|
|
37
47
|
|
|
38
48
|
|
|
39
49
|
class WorkerActor(xo.StatelessActor):
|
|
@@ -42,6 +52,8 @@ class WorkerActor(xo.StatelessActor):
|
|
|
42
52
|
supervisor_address: str,
|
|
43
53
|
main_pool: MainActorPoolType,
|
|
44
54
|
cuda_devices: List[int],
|
|
55
|
+
metrics_exporter_host: Optional[str] = None,
|
|
56
|
+
metrics_exporter_port: Optional[int] = None,
|
|
45
57
|
):
|
|
46
58
|
super().__init__()
|
|
47
59
|
# static attrs.
|
|
@@ -57,20 +69,71 @@ class WorkerActor(xo.StatelessActor):
|
|
|
57
69
|
self._gpu_to_model_uid: Dict[int, str] = {}
|
|
58
70
|
self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
|
|
59
71
|
self._model_uid_to_addr: Dict[str, str] = {}
|
|
72
|
+
self._model_uid_to_recover_count: Dict[str, int] = {}
|
|
60
73
|
self._model_uid_to_launch_args: Dict[str, Dict] = {}
|
|
61
74
|
|
|
75
|
+
# metrics export server.
|
|
76
|
+
if metrics_exporter_host is not None or metrics_exporter_port is not None:
|
|
77
|
+
logger.info(
|
|
78
|
+
f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
|
|
79
|
+
)
|
|
80
|
+
q: queue.Queue = queue.Queue()
|
|
81
|
+
self._metrics_thread = threading.Thread(
|
|
82
|
+
name="Metrics Export Server",
|
|
83
|
+
target=launch_metrics_export_server,
|
|
84
|
+
args=(q, metrics_exporter_host, metrics_exporter_port),
|
|
85
|
+
daemon=True,
|
|
86
|
+
)
|
|
87
|
+
self._metrics_thread.start()
|
|
88
|
+
logger.info("Checking metrics export server...")
|
|
89
|
+
while self._metrics_thread.is_alive():
|
|
90
|
+
try:
|
|
91
|
+
host, port = q.get(block=False)[:2]
|
|
92
|
+
logger.info(f"Metrics server is started at: http://{host}:{port}")
|
|
93
|
+
break
|
|
94
|
+
except queue.Empty:
|
|
95
|
+
pass
|
|
96
|
+
else:
|
|
97
|
+
raise Exception("Metrics server thread exit.")
|
|
98
|
+
|
|
62
99
|
self._lock = asyncio.Lock()
|
|
63
100
|
|
|
64
101
|
async def recover_sub_pool(self, address):
|
|
65
|
-
logger.warning("Process %s is down
|
|
102
|
+
logger.warning("Process %s is down.", address)
|
|
103
|
+
# Xoscar does not remove the address from sub_processes.
|
|
104
|
+
try:
|
|
105
|
+
await self._main_pool.remove_sub_pool(address)
|
|
106
|
+
except Exception:
|
|
107
|
+
pass
|
|
66
108
|
for model_uid, addr in self._model_uid_to_addr.items():
|
|
67
109
|
if addr == address:
|
|
68
110
|
launch_args = self._model_uid_to_launch_args.get(model_uid)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
111
|
+
if launch_args is None:
|
|
112
|
+
logger.warning(
|
|
113
|
+
"Not recreate model because the it is down during launch."
|
|
114
|
+
)
|
|
115
|
+
else:
|
|
116
|
+
recover_count = self._model_uid_to_recover_count.get(model_uid)
|
|
117
|
+
try:
|
|
118
|
+
await self.terminate_model(model_uid)
|
|
119
|
+
except Exception:
|
|
120
|
+
pass
|
|
121
|
+
if recover_count is not None:
|
|
122
|
+
if recover_count > 0:
|
|
123
|
+
logger.warning(
|
|
124
|
+
"Recreating model actor %s, remain %s times ...",
|
|
125
|
+
model_uid,
|
|
126
|
+
recover_count - 1,
|
|
127
|
+
)
|
|
128
|
+
self._model_uid_to_recover_count[model_uid] = (
|
|
129
|
+
recover_count - 1
|
|
130
|
+
)
|
|
131
|
+
await self.launch_builtin_model(**launch_args)
|
|
132
|
+
else:
|
|
133
|
+
logger.warning("Stop recreating model actor.")
|
|
134
|
+
else:
|
|
135
|
+
logger.warning("Recreating model actor %s ...", model_uid)
|
|
136
|
+
await self.launch_builtin_model(**launch_args)
|
|
74
137
|
break
|
|
75
138
|
|
|
76
139
|
@classmethod
|
|
@@ -78,8 +141,14 @@ class WorkerActor(xo.StatelessActor):
|
|
|
78
141
|
return "worker"
|
|
79
142
|
|
|
80
143
|
async def __post_create__(self):
|
|
144
|
+
from .status_guard import StatusGuardActor
|
|
81
145
|
from .supervisor import SupervisorActor
|
|
82
146
|
|
|
147
|
+
self._status_guard_ref: xo.ActorRefType[
|
|
148
|
+
"StatusGuardActor"
|
|
149
|
+
] = await xo.actor_ref(
|
|
150
|
+
address=self._supervisor_address, uid=StatusGuardActor.uid()
|
|
151
|
+
)
|
|
83
152
|
self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
|
|
84
153
|
address=self._supervisor_address, uid=SupervisorActor.uid()
|
|
85
154
|
)
|
|
@@ -309,7 +378,12 @@ class WorkerActor(xo.StatelessActor):
|
|
|
309
378
|
|
|
310
379
|
try:
|
|
311
380
|
model_ref = await xo.create_actor(
|
|
312
|
-
ModelActor,
|
|
381
|
+
ModelActor,
|
|
382
|
+
address=subpool_address,
|
|
383
|
+
uid=model_uid,
|
|
384
|
+
worker_address=self.address,
|
|
385
|
+
model=model,
|
|
386
|
+
model_description=model_description,
|
|
313
387
|
)
|
|
314
388
|
await model_ref.load()
|
|
315
389
|
except:
|
|
@@ -324,6 +398,22 @@ class WorkerActor(xo.StatelessActor):
|
|
|
324
398
|
self._gpu_to_model_uid[int(dev)] = model_uid
|
|
325
399
|
self._model_uid_to_addr[model_uid] = subpool_address
|
|
326
400
|
|
|
401
|
+
async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
|
|
402
|
+
from ..model.llm.core import LLM
|
|
403
|
+
|
|
404
|
+
if model_type == "embedding":
|
|
405
|
+
return ["embed"]
|
|
406
|
+
elif model_type == "rerank":
|
|
407
|
+
return ["rerank"]
|
|
408
|
+
elif model_type == "image":
|
|
409
|
+
return ["text_to_image"]
|
|
410
|
+
elif model_type == "multimodal":
|
|
411
|
+
return ["multimodal"]
|
|
412
|
+
else:
|
|
413
|
+
assert model_type == "LLM"
|
|
414
|
+
assert isinstance(model, LLM)
|
|
415
|
+
return model.model_family.model_ability # type: ignore
|
|
416
|
+
|
|
327
417
|
@log_async(logger=logger)
|
|
328
418
|
async def launch_builtin_model(
|
|
329
419
|
self,
|
|
@@ -339,6 +429,8 @@ class WorkerActor(xo.StatelessActor):
|
|
|
339
429
|
):
|
|
340
430
|
launch_args = locals()
|
|
341
431
|
launch_args.pop("self")
|
|
432
|
+
launch_args.pop("kwargs")
|
|
433
|
+
launch_args.update(kwargs)
|
|
342
434
|
if n_gpu is not None:
|
|
343
435
|
if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > cuda_count()):
|
|
344
436
|
raise ValueError(
|
|
@@ -358,6 +450,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
358
450
|
)
|
|
359
451
|
|
|
360
452
|
try:
|
|
453
|
+
origin_uid, _, _ = parse_replica_model_uid(model_uid)
|
|
361
454
|
model, model_description = await asyncio.to_thread(
|
|
362
455
|
create_model_instance,
|
|
363
456
|
subpool_address,
|
|
@@ -375,7 +468,9 @@ class WorkerActor(xo.StatelessActor):
|
|
|
375
468
|
ModelActor,
|
|
376
469
|
address=subpool_address,
|
|
377
470
|
uid=model_uid,
|
|
471
|
+
worker_address=self.address,
|
|
378
472
|
model=model,
|
|
473
|
+
model_description=model_description,
|
|
379
474
|
request_limits=request_limits,
|
|
380
475
|
)
|
|
381
476
|
await model_ref.load()
|
|
@@ -388,13 +483,27 @@ class WorkerActor(xo.StatelessActor):
|
|
|
388
483
|
self._model_uid_to_model[model_uid] = model_ref
|
|
389
484
|
self._model_uid_to_model_spec[model_uid] = model_description
|
|
390
485
|
self._model_uid_to_addr[model_uid] = subpool_address
|
|
486
|
+
self._model_uid_to_recover_count.setdefault(
|
|
487
|
+
model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
|
|
488
|
+
)
|
|
391
489
|
self._model_uid_to_launch_args[model_uid] = launch_args
|
|
392
490
|
|
|
491
|
+
# update status to READY
|
|
492
|
+
abilities = await self._get_model_ability(model, model_type)
|
|
493
|
+
await self._status_guard_ref.update_instance_info(
|
|
494
|
+
origin_uid,
|
|
495
|
+
{"model_ability": abilities, "status": LaunchStatus.READY.name},
|
|
496
|
+
)
|
|
497
|
+
|
|
393
498
|
@log_async(logger=logger)
|
|
394
499
|
async def terminate_model(self, model_uid: str):
|
|
500
|
+
origin_uid, _, _ = parse_replica_model_uid(model_uid)
|
|
501
|
+
await self._status_guard_ref.update_instance_info(
|
|
502
|
+
origin_uid, {"status": LaunchStatus.TERMINATING.name}
|
|
503
|
+
)
|
|
395
504
|
model_ref = self._model_uid_to_model.get(model_uid, None)
|
|
396
505
|
if model_ref is None:
|
|
397
|
-
|
|
506
|
+
logger.debug("Model not found, uid: %s", model_uid)
|
|
398
507
|
|
|
399
508
|
try:
|
|
400
509
|
await xo.destroy_actor(model_ref)
|
|
@@ -405,12 +514,20 @@ class WorkerActor(xo.StatelessActor):
|
|
|
405
514
|
try:
|
|
406
515
|
subpool_address = self._model_uid_to_addr[model_uid]
|
|
407
516
|
await self._main_pool.remove_sub_pool(subpool_address)
|
|
517
|
+
except Exception as e:
|
|
518
|
+
logger.debug(
|
|
519
|
+
"Remove sub pool failed, model uid: %s, error: %s", model_uid, e
|
|
520
|
+
)
|
|
408
521
|
finally:
|
|
409
|
-
|
|
410
|
-
|
|
522
|
+
self._model_uid_to_model.pop(model_uid, None)
|
|
523
|
+
self._model_uid_to_model_spec.pop(model_uid, None)
|
|
411
524
|
self.release_devices(model_uid)
|
|
412
|
-
|
|
413
|
-
|
|
525
|
+
self._model_uid_to_addr.pop(model_uid, None)
|
|
526
|
+
self._model_uid_to_recover_count.pop(model_uid, None)
|
|
527
|
+
self._model_uid_to_launch_args.pop(model_uid, None)
|
|
528
|
+
await self._status_guard_ref.update_instance_info(
|
|
529
|
+
origin_uid, {"status": LaunchStatus.TERMINATED.name}
|
|
530
|
+
)
|
|
414
531
|
|
|
415
532
|
@log_async(logger=logger)
|
|
416
533
|
async def list_models(self) -> Dict[str, Dict[str, Any]]:
|
|
@@ -425,7 +542,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
425
542
|
def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
|
|
426
543
|
model_ref = self._model_uid_to_model.get(model_uid, None)
|
|
427
544
|
if model_ref is None:
|
|
428
|
-
raise ValueError(f"Model not found
|
|
545
|
+
raise ValueError(f"Model not found, uid: {model_uid}")
|
|
429
546
|
return model_ref
|
|
430
547
|
|
|
431
548
|
@log_sync(logger=logger)
|
|
@@ -458,3 +575,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
458
575
|
await asyncio.sleep(DEFAULT_NODE_HEARTBEAT_INTERVAL)
|
|
459
576
|
except asyncio.CancelledError: # pragma: no cover
|
|
460
577
|
break
|
|
578
|
+
|
|
579
|
+
@staticmethod
|
|
580
|
+
def record_metrics(name, op, kwargs):
|
|
581
|
+
record_metrics(name, op, kwargs)
|