xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (120) hide show
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/__init__.py +13 -0
  3. xinference/api/oauth2/common.py +14 -0
  4. xinference/api/oauth2/core.py +93 -0
  5. xinference/api/oauth2/types.py +36 -0
  6. xinference/api/oauth2/utils.py +44 -0
  7. xinference/api/restful_api.py +216 -27
  8. xinference/client/oscar/actor_client.py +18 -18
  9. xinference/client/restful/restful_client.py +96 -33
  10. xinference/conftest.py +63 -1
  11. xinference/constants.py +1 -0
  12. xinference/core/chat_interface.py +143 -3
  13. xinference/core/metrics.py +83 -0
  14. xinference/core/model.py +244 -181
  15. xinference/core/status_guard.py +86 -0
  16. xinference/core/supervisor.py +57 -7
  17. xinference/core/worker.py +134 -13
  18. xinference/deploy/cmdline.py +142 -16
  19. xinference/deploy/local.py +39 -7
  20. xinference/deploy/supervisor.py +2 -0
  21. xinference/deploy/worker.py +33 -5
  22. xinference/fields.py +4 -1
  23. xinference/model/core.py +8 -1
  24. xinference/model/embedding/core.py +3 -2
  25. xinference/model/embedding/model_spec_modelscope.json +60 -18
  26. xinference/model/image/stable_diffusion/core.py +4 -3
  27. xinference/model/llm/__init__.py +7 -0
  28. xinference/model/llm/ggml/llamacpp.py +3 -2
  29. xinference/model/llm/llm_family.json +87 -3
  30. xinference/model/llm/llm_family.py +15 -5
  31. xinference/model/llm/llm_family_modelscope.json +92 -3
  32. xinference/model/llm/pytorch/chatglm.py +70 -28
  33. xinference/model/llm/pytorch/core.py +11 -30
  34. xinference/model/llm/pytorch/internlm2.py +155 -0
  35. xinference/model/llm/pytorch/utils.py +0 -153
  36. xinference/model/llm/utils.py +37 -8
  37. xinference/model/llm/vllm/core.py +15 -3
  38. xinference/model/multimodal/__init__.py +15 -8
  39. xinference/model/multimodal/core.py +8 -1
  40. xinference/model/multimodal/model_spec.json +9 -0
  41. xinference/model/multimodal/model_spec_modelscope.json +45 -0
  42. xinference/model/multimodal/qwen_vl.py +5 -9
  43. xinference/model/utils.py +7 -2
  44. xinference/types.py +2 -0
  45. xinference/web/ui/build/asset-manifest.json +3 -3
  46. xinference/web/ui/build/index.html +1 -1
  47. xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
  48. xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
  49. xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
  88. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
  89. xinference/web/ui/node_modules/.package-lock.json +36 -0
  90. xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
  91. xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
  92. xinference/web/ui/node_modules/react-cookie/package.json +55 -0
  93. xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
  94. xinference/web/ui/package-lock.json +37 -0
  95. xinference/web/ui/package.json +3 -2
  96. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
  97. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
  98. xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
  99. xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
  101. xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
  102. xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
  103. xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
  104. xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
  105. xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
  106. xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
  107. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
  108. xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
  109. xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
  110. xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
  111. xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
  112. xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
  117. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
  118. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
  119. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
  120. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Un
22
22
  import xoscar as xo
23
23
 
24
24
  from ..core import ModelActor
25
+ from ..core.status_guard import InstanceInfo, LaunchStatus
26
+ from .metrics import record_metrics
25
27
  from .resource import ResourceStatus
26
28
  from .utils import (
27
29
  build_replica_model_uid,
@@ -46,6 +48,12 @@ logger = getLogger(__name__)
46
48
 
47
49
 
48
50
  DEFAULT_NODE_TIMEOUT = 60
51
+ ASYNC_LAUNCH_TASKS = {} # type: ignore
52
+
53
+
54
+ def callback_for_async_launch(model_uid: str):
55
+ ASYNC_LAUNCH_TASKS.pop(model_uid, None)
56
+ logger.debug(f"Model uid: {model_uid} async launch completes.")
49
57
 
50
58
 
51
59
  @dataclass
@@ -81,6 +89,13 @@ class SupervisorActor(xo.StatelessActor):
81
89
  # comment this line to avoid worker lost
82
90
  # self._check_dead_nodes_task = asyncio.create_task(self._check_dead_nodes())
83
91
  logger.info(f"Xinference supervisor {self.address} started")
92
+ from .status_guard import StatusGuardActor
93
+
94
+ self._status_guard_ref: xo.ActorRefType[
95
+ "StatusGuardActor"
96
+ ] = await xo.create_actor(
97
+ StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
98
+ )
84
99
 
85
100
  from ..model.embedding import (
86
101
  CustomEmbeddingModelSpec,
@@ -119,11 +134,13 @@ class SupervisorActor(xo.StatelessActor):
119
134
  from ..model.llm.llm_family import (
120
135
  BUILTIN_LLM_MODEL_CHAT_FAMILIES,
121
136
  BUILTIN_LLM_MODEL_GENERATE_FAMILIES,
137
+ BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
122
138
  )
123
139
 
124
140
  return {
125
141
  "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
126
142
  "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
143
+ "tool_call": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
127
144
  }
128
145
 
129
146
  async def get_devices_count(self) -> int:
@@ -511,6 +528,7 @@ class SupervisorActor(xo.StatelessActor):
511
528
  replica: int = 1,
512
529
  n_gpu: Optional[Union[int, str]] = "auto",
513
530
  request_limits: Optional[int] = None,
531
+ wait_ready: bool = True,
514
532
  **kwargs,
515
533
  ) -> str:
516
534
  if model_uid is None:
@@ -552,6 +570,18 @@ class SupervisorActor(xo.StatelessActor):
552
570
  )
553
571
  self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
554
572
 
573
+ async def _launch_model():
574
+ try:
575
+ for rep_model_uid in iter_replica_model_uid(model_uid, replica):
576
+ await _launch_one_model(rep_model_uid)
577
+ except Exception:
578
+ # terminate_model will remove the replica info.
579
+ await self.terminate_model(model_uid, suppress_exception=True)
580
+ await self._status_guard_ref.update_instance_info(
581
+ model_uid, {"status": LaunchStatus.ERROR.name}
582
+ )
583
+ raise
584
+
555
585
  if not is_valid_model_uid(model_uid):
556
586
  raise ValueError(
557
587
  "The model UID is invalid. Please specify the model UID by 0 < length <= 100."
@@ -568,15 +598,31 @@ class SupervisorActor(xo.StatelessActor):
568
598
  self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
569
599
  replica=replica, scheduler=itertools.cycle(range(replica))
570
600
  )
571
- try:
572
- for rep_model_uid in iter_replica_model_uid(model_uid, replica):
573
- await _launch_one_model(rep_model_uid)
574
- except Exception:
575
- # terminate_model will remove the replica info.
576
- await self.terminate_model(model_uid, suppress_exception=True)
577
- raise
601
+ instance_info = InstanceInfo(
602
+ model_name=model_name,
603
+ model_uid=model_uid,
604
+ model_ability=[],
605
+ replica=replica,
606
+ status=LaunchStatus.CREATING.name,
607
+ instance_created_ts=int(time.time()),
608
+ )
609
+ await self._status_guard_ref.set_instance_info(model_uid, instance_info)
610
+ if wait_ready:
611
+ await _launch_model()
612
+ else:
613
+ task = asyncio.create_task(_launch_model())
614
+ ASYNC_LAUNCH_TASKS[model_uid] = task
615
+ task.add_done_callback(lambda _: callback_for_async_launch(model_uid))
578
616
  return model_uid
579
617
 
618
+ async def get_instance_info(
619
+ self, model_name: Optional[str], model_uid: Optional[str]
620
+ ) -> List[Dict]:
621
+ infos = await self._status_guard_ref.get_instance_info(
622
+ model_name=model_name, model_uid=model_uid
623
+ )
624
+ return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
625
+
580
626
  async def _check_dead_nodes(self):
581
627
  while True:
582
628
  dead_nodes = []
@@ -705,3 +751,7 @@ class SupervisorActor(xo.StatelessActor):
705
751
  self._worker_status[worker_address] = WorkerStatus(
706
752
  update_time=time.time(), status=status
707
753
  )
754
+
755
+ @staticmethod
756
+ def record_metrics(name, op, kwargs):
757
+ record_metrics(name, op, kwargs)
xinference/core/worker.py CHANGED
@@ -15,7 +15,9 @@
15
15
  import asyncio
16
16
  import os
17
17
  import platform
18
+ import queue
18
19
  import signal
20
+ import threading
19
21
  from collections import defaultdict
20
22
  from logging import getLogger
21
23
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
@@ -25,8 +27,10 @@ from xoscar import MainActorPoolType
25
27
 
26
28
  from ..constants import XINFERENCE_CACHE_DIR
27
29
  from ..core import ModelActor
30
+ from ..core.status_guard import LaunchStatus
28
31
  from ..model.core import ModelDescription, create_model_instance
29
32
  from ..utils import cuda_count
33
+ from .metrics import launch_metrics_export_server, record_metrics
30
34
  from .resource import gather_node_info
31
35
  from .utils import log_async, log_sync, parse_replica_model_uid, purge_dir
32
36
 
@@ -34,6 +38,12 @@ logger = getLogger(__name__)
34
38
 
35
39
 
36
40
  DEFAULT_NODE_HEARTBEAT_INTERVAL = 5
41
+ MODEL_ACTOR_AUTO_RECOVER_LIMIT: Optional[int]
42
+ _MODEL_ACTOR_AUTO_RECOVER_LIMIT = os.getenv("XINFERENCE_MODEL_ACTOR_AUTO_RECOVER_LIMIT")
43
+ if _MODEL_ACTOR_AUTO_RECOVER_LIMIT is not None:
44
+ MODEL_ACTOR_AUTO_RECOVER_LIMIT = int(_MODEL_ACTOR_AUTO_RECOVER_LIMIT)
45
+ else:
46
+ MODEL_ACTOR_AUTO_RECOVER_LIMIT = None
37
47
 
38
48
 
39
49
  class WorkerActor(xo.StatelessActor):
@@ -42,6 +52,8 @@ class WorkerActor(xo.StatelessActor):
42
52
  supervisor_address: str,
43
53
  main_pool: MainActorPoolType,
44
54
  cuda_devices: List[int],
55
+ metrics_exporter_host: Optional[str] = None,
56
+ metrics_exporter_port: Optional[int] = None,
45
57
  ):
46
58
  super().__init__()
47
59
  # static attrs.
@@ -57,20 +69,71 @@ class WorkerActor(xo.StatelessActor):
57
69
  self._gpu_to_model_uid: Dict[int, str] = {}
58
70
  self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
59
71
  self._model_uid_to_addr: Dict[str, str] = {}
72
+ self._model_uid_to_recover_count: Dict[str, int] = {}
60
73
  self._model_uid_to_launch_args: Dict[str, Dict] = {}
61
74
 
75
+ # metrics export server.
76
+ if metrics_exporter_host is not None or metrics_exporter_port is not None:
77
+ logger.info(
78
+ f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
79
+ )
80
+ q: queue.Queue = queue.Queue()
81
+ self._metrics_thread = threading.Thread(
82
+ name="Metrics Export Server",
83
+ target=launch_metrics_export_server,
84
+ args=(q, metrics_exporter_host, metrics_exporter_port),
85
+ daemon=True,
86
+ )
87
+ self._metrics_thread.start()
88
+ logger.info("Checking metrics export server...")
89
+ while self._metrics_thread.is_alive():
90
+ try:
91
+ host, port = q.get(block=False)[:2]
92
+ logger.info(f"Metrics server is started at: http://{host}:{port}")
93
+ break
94
+ except queue.Empty:
95
+ pass
96
+ else:
97
+ raise Exception("Metrics server thread exit.")
98
+
62
99
  self._lock = asyncio.Lock()
63
100
 
64
101
  async def recover_sub_pool(self, address):
65
- logger.warning("Process %s is down, create model.", address)
102
+ logger.warning("Process %s is down.", address)
103
+ # Xoscar does not remove the address from sub_processes.
104
+ try:
105
+ await self._main_pool.remove_sub_pool(address)
106
+ except Exception:
107
+ pass
66
108
  for model_uid, addr in self._model_uid_to_addr.items():
67
109
  if addr == address:
68
110
  launch_args = self._model_uid_to_launch_args.get(model_uid)
69
- try:
70
- await self.terminate_model(model_uid)
71
- except Exception:
72
- pass
73
- await self.launch_builtin_model(**launch_args)
111
+ if launch_args is None:
112
+ logger.warning(
113
+ "Not recreate model because the it is down during launch."
114
+ )
115
+ else:
116
+ recover_count = self._model_uid_to_recover_count.get(model_uid)
117
+ try:
118
+ await self.terminate_model(model_uid)
119
+ except Exception:
120
+ pass
121
+ if recover_count is not None:
122
+ if recover_count > 0:
123
+ logger.warning(
124
+ "Recreating model actor %s, remain %s times ...",
125
+ model_uid,
126
+ recover_count - 1,
127
+ )
128
+ self._model_uid_to_recover_count[model_uid] = (
129
+ recover_count - 1
130
+ )
131
+ await self.launch_builtin_model(**launch_args)
132
+ else:
133
+ logger.warning("Stop recreating model actor.")
134
+ else:
135
+ logger.warning("Recreating model actor %s ...", model_uid)
136
+ await self.launch_builtin_model(**launch_args)
74
137
  break
75
138
 
76
139
  @classmethod
@@ -78,8 +141,14 @@ class WorkerActor(xo.StatelessActor):
78
141
  return "worker"
79
142
 
80
143
  async def __post_create__(self):
144
+ from .status_guard import StatusGuardActor
81
145
  from .supervisor import SupervisorActor
82
146
 
147
+ self._status_guard_ref: xo.ActorRefType[
148
+ "StatusGuardActor"
149
+ ] = await xo.actor_ref(
150
+ address=self._supervisor_address, uid=StatusGuardActor.uid()
151
+ )
83
152
  self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
84
153
  address=self._supervisor_address, uid=SupervisorActor.uid()
85
154
  )
@@ -309,7 +378,12 @@ class WorkerActor(xo.StatelessActor):
309
378
 
310
379
  try:
311
380
  model_ref = await xo.create_actor(
312
- ModelActor, address=subpool_address, uid=model_uid, model=model
381
+ ModelActor,
382
+ address=subpool_address,
383
+ uid=model_uid,
384
+ worker_address=self.address,
385
+ model=model,
386
+ model_description=model_description,
313
387
  )
314
388
  await model_ref.load()
315
389
  except:
@@ -324,6 +398,22 @@ class WorkerActor(xo.StatelessActor):
324
398
  self._gpu_to_model_uid[int(dev)] = model_uid
325
399
  self._model_uid_to_addr[model_uid] = subpool_address
326
400
 
401
+ async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
402
+ from ..model.llm.core import LLM
403
+
404
+ if model_type == "embedding":
405
+ return ["embed"]
406
+ elif model_type == "rerank":
407
+ return ["rerank"]
408
+ elif model_type == "image":
409
+ return ["text_to_image"]
410
+ elif model_type == "multimodal":
411
+ return ["multimodal"]
412
+ else:
413
+ assert model_type == "LLM"
414
+ assert isinstance(model, LLM)
415
+ return model.model_family.model_ability # type: ignore
416
+
327
417
  @log_async(logger=logger)
328
418
  async def launch_builtin_model(
329
419
  self,
@@ -339,6 +429,8 @@ class WorkerActor(xo.StatelessActor):
339
429
  ):
340
430
  launch_args = locals()
341
431
  launch_args.pop("self")
432
+ launch_args.pop("kwargs")
433
+ launch_args.update(kwargs)
342
434
  if n_gpu is not None:
343
435
  if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > cuda_count()):
344
436
  raise ValueError(
@@ -358,6 +450,7 @@ class WorkerActor(xo.StatelessActor):
358
450
  )
359
451
 
360
452
  try:
453
+ origin_uid, _, _ = parse_replica_model_uid(model_uid)
361
454
  model, model_description = await asyncio.to_thread(
362
455
  create_model_instance,
363
456
  subpool_address,
@@ -375,7 +468,9 @@ class WorkerActor(xo.StatelessActor):
375
468
  ModelActor,
376
469
  address=subpool_address,
377
470
  uid=model_uid,
471
+ worker_address=self.address,
378
472
  model=model,
473
+ model_description=model_description,
379
474
  request_limits=request_limits,
380
475
  )
381
476
  await model_ref.load()
@@ -388,13 +483,27 @@ class WorkerActor(xo.StatelessActor):
388
483
  self._model_uid_to_model[model_uid] = model_ref
389
484
  self._model_uid_to_model_spec[model_uid] = model_description
390
485
  self._model_uid_to_addr[model_uid] = subpool_address
486
+ self._model_uid_to_recover_count.setdefault(
487
+ model_uid, MODEL_ACTOR_AUTO_RECOVER_LIMIT
488
+ )
391
489
  self._model_uid_to_launch_args[model_uid] = launch_args
392
490
 
491
+ # update status to READY
492
+ abilities = await self._get_model_ability(model, model_type)
493
+ await self._status_guard_ref.update_instance_info(
494
+ origin_uid,
495
+ {"model_ability": abilities, "status": LaunchStatus.READY.name},
496
+ )
497
+
393
498
  @log_async(logger=logger)
394
499
  async def terminate_model(self, model_uid: str):
500
+ origin_uid, _, _ = parse_replica_model_uid(model_uid)
501
+ await self._status_guard_ref.update_instance_info(
502
+ origin_uid, {"status": LaunchStatus.TERMINATING.name}
503
+ )
395
504
  model_ref = self._model_uid_to_model.get(model_uid, None)
396
505
  if model_ref is None:
397
- raise ValueError(f"Model not found in the model list, uid: {model_uid}")
506
+ logger.debug("Model not found, uid: %s", model_uid)
398
507
 
399
508
  try:
400
509
  await xo.destroy_actor(model_ref)
@@ -405,12 +514,20 @@ class WorkerActor(xo.StatelessActor):
405
514
  try:
406
515
  subpool_address = self._model_uid_to_addr[model_uid]
407
516
  await self._main_pool.remove_sub_pool(subpool_address)
517
+ except Exception as e:
518
+ logger.debug(
519
+ "Remove sub pool failed, model uid: %s, error: %s", model_uid, e
520
+ )
408
521
  finally:
409
- del self._model_uid_to_model[model_uid]
410
- del self._model_uid_to_model_spec[model_uid]
522
+ self._model_uid_to_model.pop(model_uid, None)
523
+ self._model_uid_to_model_spec.pop(model_uid, None)
411
524
  self.release_devices(model_uid)
412
- del self._model_uid_to_addr[model_uid]
413
- del self._model_uid_to_launch_args[model_uid]
525
+ self._model_uid_to_addr.pop(model_uid, None)
526
+ self._model_uid_to_recover_count.pop(model_uid, None)
527
+ self._model_uid_to_launch_args.pop(model_uid, None)
528
+ await self._status_guard_ref.update_instance_info(
529
+ origin_uid, {"status": LaunchStatus.TERMINATED.name}
530
+ )
414
531
 
415
532
  @log_async(logger=logger)
416
533
  async def list_models(self) -> Dict[str, Dict[str, Any]]:
@@ -425,7 +542,7 @@ class WorkerActor(xo.StatelessActor):
425
542
  def get_model(self, model_uid: str) -> xo.ActorRefType["ModelActor"]:
426
543
  model_ref = self._model_uid_to_model.get(model_uid, None)
427
544
  if model_ref is None:
428
- raise ValueError(f"Model not found in the model list, uid: {model_uid}")
545
+ raise ValueError(f"Model not found, uid: {model_uid}")
429
546
  return model_ref
430
547
 
431
548
  @log_sync(logger=logger)
@@ -458,3 +575,7 @@ class WorkerActor(xo.StatelessActor):
458
575
  await asyncio.sleep(DEFAULT_NODE_HEARTBEAT_INTERVAL)
459
576
  except asyncio.CancelledError: # pragma: no cover
460
577
  break
578
+
579
+ @staticmethod
580
+ def record_metrics(name, op, kwargs):
581
+ record_metrics(name, op, kwargs)