xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (95)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +132 -0
  3. xinference/api/restful_api.py +282 -78
  4. xinference/client/handlers.py +3 -0
  5. xinference/client/restful/restful_client.py +108 -75
  6. xinference/constants.py +14 -4
  7. xinference/core/cache_tracker.py +102 -0
  8. xinference/core/chat_interface.py +10 -4
  9. xinference/core/event.py +56 -0
  10. xinference/core/model.py +44 -0
  11. xinference/core/resource.py +19 -12
  12. xinference/core/status_guard.py +4 -0
  13. xinference/core/supervisor.py +278 -87
  14. xinference/core/utils.py +68 -3
  15. xinference/core/worker.py +98 -8
  16. xinference/deploy/cmdline.py +6 -3
  17. xinference/deploy/local.py +2 -2
  18. xinference/deploy/supervisor.py +2 -2
  19. xinference/model/audio/__init__.py +27 -0
  20. xinference/model/audio/core.py +161 -0
  21. xinference/model/audio/model_spec.json +79 -0
  22. xinference/model/audio/utils.py +18 -0
  23. xinference/model/audio/whisper.py +132 -0
  24. xinference/model/core.py +18 -13
  25. xinference/model/embedding/__init__.py +27 -2
  26. xinference/model/embedding/core.py +43 -3
  27. xinference/model/embedding/model_spec.json +24 -0
  28. xinference/model/embedding/model_spec_modelscope.json +24 -0
  29. xinference/model/embedding/utils.py +18 -0
  30. xinference/model/image/__init__.py +12 -1
  31. xinference/model/image/core.py +63 -9
  32. xinference/model/image/utils.py +26 -0
  33. xinference/model/llm/__init__.py +20 -1
  34. xinference/model/llm/core.py +43 -2
  35. xinference/model/llm/ggml/chatglm.py +15 -6
  36. xinference/model/llm/llm_family.json +197 -6
  37. xinference/model/llm/llm_family.py +9 -7
  38. xinference/model/llm/llm_family_modelscope.json +189 -4
  39. xinference/model/llm/pytorch/chatglm.py +3 -3
  40. xinference/model/llm/pytorch/core.py +4 -2
  41. xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
  42. xinference/model/llm/pytorch/utils.py +21 -9
  43. xinference/model/llm/pytorch/yi_vl.py +246 -0
  44. xinference/model/llm/utils.py +57 -4
  45. xinference/model/llm/vllm/core.py +5 -4
  46. xinference/model/rerank/__init__.py +25 -2
  47. xinference/model/rerank/core.py +51 -9
  48. xinference/model/rerank/model_spec.json +6 -0
  49. xinference/model/rerank/model_spec_modelscope.json +7 -0
  50. xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
  51. xinference/model/utils.py +5 -3
  52. xinference/thirdparty/__init__.py +0 -0
  53. xinference/thirdparty/llava/__init__.py +1 -0
  54. xinference/thirdparty/llava/conversation.py +205 -0
  55. xinference/thirdparty/llava/mm_utils.py +122 -0
  56. xinference/thirdparty/llava/model/__init__.py +1 -0
  57. xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
  58. xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
  59. xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
  60. xinference/thirdparty/llava/model/constants.py +6 -0
  61. xinference/thirdparty/llava/model/llava_arch.py +385 -0
  62. xinference/thirdparty/llava/model/llava_llama.py +163 -0
  63. xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
  64. xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
  65. xinference/types.py +1 -1
  66. xinference/web/ui/build/asset-manifest.json +3 -3
  67. xinference/web/ui/build/index.html +1 -1
  68. xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
  69. xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
  75. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
  76. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
  77. xinference/api/oauth2/core.py +0 -93
  78. xinference/model/multimodal/__init__.py +0 -52
  79. xinference/model/multimodal/core.py +0 -467
  80. xinference/model/multimodal/model_spec.json +0 -43
  81. xinference/model/multimodal/model_spec_modelscope.json +0 -45
  82. xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
  83. xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
  91. /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
  92. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
  93. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
  94. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
  95. {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0
--- a/xinference/core/supervisor.py
+++ b/xinference/core/supervisor.py
@@ -21,10 +21,16 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union
 
 import xoscar as xo
 
+from ..constants import (
+    XINFERENCE_DISABLE_HEALTH_CHECK,
+    XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
+    XINFERENCE_HEALTH_CHECK_INTERVAL,
+    XINFERENCE_HEALTH_CHECK_TIMEOUT,
+)
 from ..core import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
 from .metrics import record_metrics
-from .resource import ResourceStatus
+from .resource import GPUStatus, ResourceStatus
 from .utils import (
     build_replica_model_uid,
     gen_random_string,
@@ -32,14 +38,15 @@ from .utils import (
     iter_replica_model_uid,
     log_async,
     log_sync,
+    parse_model_version,
     parse_replica_model_uid,
 )
 
 if TYPE_CHECKING:
+    from ..model.audio import AudioModelFamilyV1
     from ..model.embedding import EmbeddingModelSpec
     from ..model.image import ImageModelFamilyV1
     from ..model.llm import LLMFamilyV1
-    from ..model.multimodal import LVLMFamilyV1
     from ..model.rerank import RerankModelSpec
     from .worker import WorkerActor
 
@@ -47,7 +54,6 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
-DEFAULT_NODE_TIMEOUT = 60
 ASYNC_LAUNCH_TASKS = {}  # type: ignore
 
 
@@ -59,7 +65,8 @@ def callback_for_async_launch(model_uid: str):
 @dataclass
 class WorkerStatus:
     update_time: float
-    status: Dict[str, ResourceStatus]
+    failure_remaining_count: int
+    status: Dict[str, Union[ResourceStatus, GPUStatus]]
 
 
 @dataclass
@@ -86,9 +93,17 @@ class SupervisorActor(xo.StatelessActor):
 
     async def __post_create__(self):
         self._uptime = time.time()
-        # comment this line to avoid worker lost
-        # self._check_dead_nodes_task = asyncio.create_task(self._check_dead_nodes())
+        if not XINFERENCE_DISABLE_HEALTH_CHECK:
+            # Run _check_dead_nodes() in a dedicated thread.
+            from ..isolation import Isolation
+
+            self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
+            self._isolation.start()
+            asyncio.run_coroutine_threadsafe(
+                self._check_dead_nodes(), loop=self._isolation.loop
+            )
         logger.info(f"Xinference supervisor {self.address} started")
+        from .cache_tracker import CacheTrackerActor
         from .status_guard import StatusGuardActor
 
         self._status_guard_ref: xo.ActorRefType[
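
Note on the __post_create__ hunk above: the dead-node check now runs on its own event loop in a separate thread (via xinference's internal Isolation helper), so a busy supervisor loop cannot starve it. A minimal stdlib-only sketch of that pattern, with illustrative names that are not xinference APIs:

    import asyncio
    import threading
    import time

    def start_background_loop() -> asyncio.AbstractEventLoop:
        # Dedicated event loop running on a daemon thread, so a blocked
        # main loop cannot starve the periodic check.
        loop = asyncio.new_event_loop()
        threading.Thread(target=loop.run_forever, daemon=True).start()
        return loop

    async def watchdog(interval: float = 1.0) -> None:
        while True:
            print("checking worker heartbeats ...")  # stand-in for _check_dead_nodes()
            await asyncio.sleep(interval)

    loop = start_background_loop()
    asyncio.run_coroutine_threadsafe(watchdog(), loop)
    time.sleep(3)  # keep the main thread alive long enough to observe a few ticks
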
@@ -96,30 +111,98 @@ class SupervisorActor(xo.StatelessActor):
         ] = await xo.create_actor(
             StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
         )
+        self._cache_tracker_ref: xo.ActorRefType[
+            "CacheTrackerActor"
+        ] = await xo.create_actor(
+            CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
+        )
+
+        from .event import EventCollectorActor
+
+        self._event_collector_ref: xo.ActorRefType[
+            EventCollectorActor
+        ] = await xo.create_actor(
+            EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
+        )
 
         from ..model.embedding import (
             CustomEmbeddingModelSpec,
+            generate_embedding_description,
+            get_embedding_model_descriptions,
             register_embedding,
             unregister_embedding,
         )
-        from ..model.llm import register_llm, unregister_llm
-        from ..model.llm.llm_family import CustomLLMFamilyV1
-        from ..model.rerank.custom import (
+        from ..model.image import get_image_model_descriptions
+        from ..model.llm import (
+            CustomLLMFamilyV1,
+            generate_llm_description,
+            get_llm_model_descriptions,
+            register_llm,
+            unregister_llm,
+        )
+        from ..model.rerank import (
             CustomRerankModelSpec,
+            generate_rerank_description,
+            get_rerank_model_descriptions,
             register_rerank,
             unregister_rerank,
         )
 
         self._custom_register_type_to_cls: Dict[str, Tuple] = {
-            "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
+            "LLM": (
+                CustomLLMFamilyV1,
+                register_llm,
+                unregister_llm,
+                generate_llm_description,
+            ),
             "embedding": (
                 CustomEmbeddingModelSpec,
                 register_embedding,
                 unregister_embedding,
+                generate_embedding_description,
+            ),
+            "rerank": (
+                CustomRerankModelSpec,
+                register_rerank,
+                unregister_rerank,
+                generate_rerank_description,
             ),
-            "rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
         }
 
+        # record model version
+        model_version_infos: Dict[str, List[Dict]] = {}
+        model_version_infos.update(get_llm_model_descriptions())
+        model_version_infos.update(get_embedding_model_descriptions())
+        model_version_infos.update(get_rerank_model_descriptions())
+        model_version_infos.update(get_image_model_descriptions())
+        await self._cache_tracker_ref.record_model_version(
+            model_version_infos, self.address
+        )
+
+    async def get_cluster_device_info(self) -> List:
+        supervisor_device_info = {
+            "ip_address": self.address.split(":")[0],
+            "gpu_count": 0,
+            "gpu_vram_total": 0,
+        }
+        res = [{"node_type": "Supervisor", **supervisor_device_info}]
+        for worker_addr, worker_status in self._worker_status.items():
+            vram_total: float = sum(
+                [v.mem_total for k, v in worker_status.status.items() if k != "cpu"]  # type: ignore
+            )
+            total = (
+                vram_total if vram_total == 0 else f"{int(vram_total / 1024 / 1024)}MiB"
+            )
+            res.append(
+                {
+                    "node_type": "Worker",
+                    "ip_address": worker_addr.split(":")[0],
+                    "gpu_count": len(worker_status.status) - 1,
+                    "gpu_vram_total": total,
+                }
+            )
+        return res
+
     @staticmethod
     async def get_builtin_prompts() -> Dict[str, Any]:
         from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
@@ -180,99 +263,129 @@ class SupervisorActor(xo.StatelessActor):
             "workers": self._worker_status,
         }
 
-    def _to_llm_reg(
+    async def _to_llm_reg(
         self, llm_family: "LLMFamilyV1", is_builtin: bool
     ) -> Dict[str, Any]:
         from ..model.llm import get_cache_status
 
+        instance_cnt = await self.get_instance_count(llm_family.model_name)
+        version_cnt = await self.get_model_version_count(llm_family.model_name)
+
         if self.is_local_deployment():
             specs = []
             # TODO: does not work when the supervisor and worker are running on separate nodes.
             for spec in llm_family.model_specs:
                 cache_status = get_cache_status(llm_family, spec)
                 specs.append({**spec.dict(), "cache_status": cache_status})
-            return {**llm_family.dict(), "is_builtin": is_builtin, "model_specs": specs}
+            res = {**llm_family.dict(), "is_builtin": is_builtin, "model_specs": specs}
         else:
-            return {**llm_family.dict(), "is_builtin": is_builtin}
+            res = {**llm_family.dict(), "is_builtin": is_builtin}
+        res["model_version_count"] = version_cnt
+        res["model_instance_count"] = instance_cnt
+        return res
 
-    def _to_embedding_model_reg(
+    async def _to_embedding_model_reg(
         self, model_spec: "EmbeddingModelSpec", is_builtin: bool
     ) -> Dict[str, Any]:
         from ..model.embedding import get_cache_status
 
+        instance_cnt = await self.get_instance_count(model_spec.model_name)
+        version_cnt = await self.get_model_version_count(model_spec.model_name)
+
         if self.is_local_deployment():
             # TODO: does not work when the supervisor and worker are running on separate nodes.
             cache_status = get_cache_status(model_spec)
-            return {
+            res = {
                 **model_spec.dict(),
                 "cache_status": cache_status,
                 "is_builtin": is_builtin,
             }
         else:
-            return {
+            res = {
                 **model_spec.dict(),
                 "is_builtin": is_builtin,
             }
+        res["model_version_count"] = version_cnt
+        res["model_instance_count"] = instance_cnt
+        return res
 
-    def _to_rerank_model_reg(
+    async def _to_rerank_model_reg(
         self, model_spec: "RerankModelSpec", is_builtin: bool
     ) -> Dict[str, Any]:
         from ..model.rerank import get_cache_status
 
+        instance_cnt = await self.get_instance_count(model_spec.model_name)
+        version_cnt = await self.get_model_version_count(model_spec.model_name)
+
         if self.is_local_deployment():
             # TODO: does not work when the supervisor and worker are running on separate nodes.
             cache_status = get_cache_status(model_spec)
-            return {
+            res = {
                 **model_spec.dict(),
                 "cache_status": cache_status,
                 "is_builtin": is_builtin,
             }
         else:
-            return {
+            res = {
                 **model_spec.dict(),
                 "is_builtin": is_builtin,
             }
+        res["model_version_count"] = version_cnt
+        res["model_instance_count"] = instance_cnt
+        return res
 
-    def _to_image_model_reg(
+    async def _to_image_model_reg(
         self, model_family: "ImageModelFamilyV1", is_builtin: bool
     ) -> Dict[str, Any]:
         from ..model.image import get_cache_status
 
+        instance_cnt = await self.get_instance_count(model_family.model_name)
+        version_cnt = await self.get_model_version_count(model_family.model_name)
+
         if self.is_local_deployment():
             # TODO: does not work when the supervisor and worker are running on separate nodes.
             cache_status = get_cache_status(model_family)
-            return {
+            res = {
                 **model_family.dict(),
                 "cache_status": cache_status,
                 "is_builtin": is_builtin,
             }
         else:
-            return {
+            res = {
                 **model_family.dict(),
                 "is_builtin": is_builtin,
             }
+        res["model_version_count"] = version_cnt
+        res["model_instance_count"] = instance_cnt
+        return res
 
-    def _to_multimodal_reg(
-        self, model_family: "LVLMFamilyV1", is_builtin: bool
+    async def _to_audio_model_reg(
+        self, model_family: "AudioModelFamilyV1", is_builtin: bool
     ) -> Dict[str, Any]:
-        from ..model.llm import get_cache_status
+        from ..model.audio import get_cache_status
+
+        instance_cnt = await self.get_instance_count(model_family.model_name)
+        version_cnt = await self.get_model_version_count(model_family.model_name)
 
         if self.is_local_deployment():
-            specs = []
             # TODO: does not work when the supervisor and worker are running on separate nodes.
-            for spec in model_family.model_specs:
-                cache_status = get_cache_status(model_family, spec)
-                specs.append({**spec.dict(), "cache_status": cache_status})
-            return {
+            cache_status = get_cache_status(model_family)
+            res = {
                 **model_family.dict(),
+                "cache_status": cache_status,
                 "is_builtin": is_builtin,
-                "model_specs": specs,
             }
         else:
-            return {**model_family.dict(), "is_builtin": is_builtin}
+            res = {
+                **model_family.dict(),
+                "is_builtin": is_builtin,
+            }
+        res["model_version_count"] = version_cnt
+        res["model_instance_count"] = instance_cnt
+        return res
 
-    @log_sync(logger=logger)
-    def list_model_registrations(
+    @log_async(logger=logger)
+    async def list_model_registrations(
         self, model_type: str, detailed: bool = False
     ) -> List[Dict[str, Any]]:
         def sort_helper(item):
@@ -285,13 +398,13 @@ class SupervisorActor(xo.StatelessActor):
             ret = []
             for family in BUILTIN_LLM_FAMILIES:
                 if detailed:
-                    ret.append(self._to_llm_reg(family, True))
+                    ret.append(await self._to_llm_reg(family, True))
                 else:
                     ret.append({"model_name": family.model_name, "is_builtin": True})
 
             for family in get_user_defined_llm_families():
                 if detailed:
-                    ret.append(self._to_llm_reg(family, False))
+                    ret.append(await self._to_llm_reg(family, False))
                 else:
                     ret.append({"model_name": family.model_name, "is_builtin": False})
 
@@ -304,14 +417,16 @@ class SupervisorActor(xo.StatelessActor):
             ret = []
             for model_name, family in BUILTIN_EMBEDDING_MODELS.items():
                 if detailed:
-                    ret.append(self._to_embedding_model_reg(family, is_builtin=True))
+                    ret.append(
+                        await self._to_embedding_model_reg(family, is_builtin=True)
+                    )
                 else:
                     ret.append({"model_name": model_name, "is_builtin": True})
 
             for model_spec in get_user_defined_embeddings():
                 if detailed:
                     ret.append(
-                        self._to_embedding_model_reg(model_spec, is_builtin=False)
+                        await self._to_embedding_model_reg(model_spec, is_builtin=False)
                     )
                 else:
                     ret.append(
@@ -326,7 +441,19 @@ class SupervisorActor(xo.StatelessActor):
             ret = []
             for model_name, family in BUILTIN_IMAGE_MODELS.items():
                 if detailed:
-                    ret.append(self._to_image_model_reg(family, is_builtin=True))
+                    ret.append(await self._to_image_model_reg(family, is_builtin=True))
+                else:
+                    ret.append({"model_name": model_name, "is_builtin": True})
+
+            ret.sort(key=sort_helper)
+            return ret
+        elif model_type == "audio":
+            from ..model.audio import BUILTIN_AUDIO_MODELS
+
+            ret = []
+            for model_name, family in BUILTIN_AUDIO_MODELS.items():
+                if detailed:
+                    ret.append(await self._to_audio_model_reg(family, is_builtin=True))
                 else:
                     ret.append({"model_name": model_name, "is_builtin": True})
 
@@ -339,30 +466,20 @@ class SupervisorActor(xo.StatelessActor):
             ret = []
            for model_name, family in BUILTIN_RERANK_MODELS.items():
                 if detailed:
-                    ret.append(self._to_rerank_model_reg(family, is_builtin=True))
+                    ret.append(await self._to_rerank_model_reg(family, is_builtin=True))
                 else:
                     ret.append({"model_name": model_name, "is_builtin": True})
 
             for model_spec in get_user_defined_reranks():
                 if detailed:
-                    ret.append(self._to_rerank_model_reg(model_spec, is_builtin=False))
+                    ret.append(
+                        await self._to_rerank_model_reg(model_spec, is_builtin=False)
+                    )
                 else:
                     ret.append(
                         {"model_name": model_spec.model_name, "is_builtin": False}
                     )
 
-            ret.sort(key=sort_helper)
-            return ret
-        elif model_type == "multimodal":
-            from ..model.multimodal import BUILTIN_LVLM_FAMILIES
-
-            ret = []
-            for family in BUILTIN_LVLM_FAMILIES:
-                if detailed:
-                    ret.append(self._to_multimodal_reg(family, True))
-                else:
-                    ret.append({"model_name": family.model_name, "is_builtin": True})
-
 
             ret.sort(key=sort_helper)
             return ret
@@ -395,18 +512,18 @@ class SupervisorActor(xo.StatelessActor):
                 if f.model_name == model_name:
                     return f
             raise ValueError(f"Model {model_name} not found")
-        elif model_type == "rerank":
-            from ..model.rerank import BUILTIN_RERANK_MODELS
-            from ..model.rerank.custom import get_user_defined_reranks
+        elif model_type == "audio":
+            from ..model.audio import BUILTIN_AUDIO_MODELS
 
-            for f in list(BUILTIN_RERANK_MODELS.values()) + get_user_defined_reranks():
+            for f in BUILTIN_AUDIO_MODELS.values():
                 if f.model_name == model_name:
                     return f
             raise ValueError(f"Model {model_name} not found")
-        elif model_type == "multimodal":
-            from ..model.multimodal import BUILTIN_LVLM_FAMILIES
+        elif model_type == "rerank":
+            from ..model.rerank import BUILTIN_RERANK_MODELS
+            from ..model.rerank.custom import get_user_defined_reranks
 
-            for f in BUILTIN_LVLM_FAMILIES:
+            for f in list(BUILTIN_RERANK_MODELS.values()) + get_user_defined_reranks():
                 if f.model_name == model_name:
                     return f
             raise ValueError(f"Model {model_name} not found")
@@ -420,6 +537,7 @@ class SupervisorActor(xo.StatelessActor):
             model_spec_cls,
             register_fn,
             unregister_fn,
+            generate_fn,
         ) = self._custom_register_type_to_cls[model_type]
 
         if not self.is_local_deployment():
@@ -430,6 +548,9 @@ class SupervisorActor(xo.StatelessActor):
             model_spec = model_spec_cls.parse_raw(model)
             try:
                 register_fn(model_spec, persist)
+                await self._cache_tracker_ref.record_model_version(
+                    generate_fn(model_spec), self.address
+                )
             except Exception as e:
                 unregister_fn(model_spec.model_name, raise_error=False)
                 raise e
@@ -439,8 +560,9 @@ class SupervisorActor(xo.StatelessActor):
     @log_async(logger=logger)
     async def unregister_model(self, model_type: str, model_name: str):
         if model_type in self._custom_register_type_to_cls:
-            _, _, unregister_fn = self._custom_register_type_to_cls[model_type]
+            _, _, unregister_fn, _ = self._custom_register_type_to_cls[model_type]
             unregister_fn(model_name)
+            await self._cache_tracker_ref.unregister_model_version(model_name)
 
         if not self.is_local_deployment():
             workers = list(self._worker_address_to_worker.values())
@@ -457,6 +579,43 @@ class SupervisorActor(xo.StatelessActor):
         )
         return f"{model_name}-{gen_random_string(8)}"
 
+    async def get_model_versions(self, model_type: str, model_name: str) -> List[Dict]:
+        return await self._cache_tracker_ref.get_model_versions(model_name)
+
+    async def get_model_version_count(self, model_name: str) -> int:
+        return await self._cache_tracker_ref.get_model_version_count(model_name)
+
+    @log_async(logger=logger)
+    async def launch_model_by_version(
+        self,
+        model_uid: Optional[str],
+        model_type: str,
+        model_version: str,
+        replica: int = 1,
+        n_gpu: Optional[Union[int, str]] = "auto",
+        wait_ready: bool = True,
+    ):
+        parse_results = parse_model_version(model_version, model_type)
+
+        if model_type == "image" and len(parse_results) == 2:
+            kwargs = {"controlnet": parse_results[1]}
+        else:
+            kwargs = {}
+
+        return await self.launch_builtin_model(
+            model_uid=model_uid,
+            model_name=parse_results[0],
+            model_size_in_billions=parse_results[1] if model_type == "LLM" else None,
+            model_format=parse_results[2] if model_type == "LLM" else None,
+            quantization=parse_results[3] if model_type == "LLM" else None,
+            model_type=model_type,
+            replica=replica,
+            n_gpu=n_gpu,
+            wait_ready=wait_ready,
+            model_version=model_version,
+            **kwargs,
+        )
+
     async def launch_speculative_llm(
         self,
         model_uid: Optional[str],
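
Note on launch_model_by_version above: the call site implies a contract for parse_model_version (added to xinference/core/utils.py, +68 -3 in the file list): LLM versions unpack into (model_name, model_size_in_billions, model_format, quantization), while image versions may carry a second controlnet component. A hypothetical sketch of a parser meeting that contract; the "--" delimiter and the size-suffix handling are assumptions, not the shipped implementation:

    from typing import Tuple

    def parse_model_version(model_version: str, model_type: str) -> Tuple:
        # Hypothetical: assumes components are joined with "--".
        parts = model_version.split("--")
        if model_type == "LLM":
            if len(parts) != 4:
                raise ValueError(f"invalid LLM model version: {model_version}")
            # (model_name, model_size_in_billions, model_format, quantization)
            return parts[0], int(parts[1].rstrip("B")), parts[2], parts[3]
        return tuple(parts)  # e.g. image: (model_name,) or (model_name, controlnet)
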
@@ -529,6 +688,7 @@ class SupervisorActor(xo.StatelessActor):
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
         wait_ready: bool = True,
+        model_version: Optional[str] = None,
         **kwargs,
     ) -> str:
         if model_uid is None:
@@ -601,6 +761,7 @@ class SupervisorActor(xo.StatelessActor):
         instance_info = InstanceInfo(
             model_name=model_name,
             model_uid=model_uid,
+            model_version=model_version,
             model_ability=[],
             replica=replica,
             status=LaunchStatus.CREATING.name,
@@ -623,29 +784,53 @@ class SupervisorActor(xo.StatelessActor):
         )
         return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
 
+    async def get_instance_count(self, model_name: str) -> int:
+        return await self._status_guard_ref.get_instance_count(model_name)
+
     async def _check_dead_nodes(self):
         while True:
-            dead_nodes = []
-            for address, status in self._worker_status.items():
-                if time.time() - status.update_time > DEFAULT_NODE_TIMEOUT:
-                    dead_models = []
-                    for model_uid in self._replica_model_uid_to_worker:
-                        if (
-                            self._replica_model_uid_to_worker[model_uid].address
-                            == address
-                        ):
-                            dead_models.append(model_uid)
-                    logger.error(
-                        "Worker timeout. address: %s, influenced models: %s",
-                        address,
-                        dead_models,
-                    )
-                    dead_nodes.append(address)
-
-            for address in dead_nodes:
-                self._worker_status.pop(address)
-                self._worker_address_to_worker.pop(address)
-            await asyncio.sleep(5)
+            try:
+                dead_nodes = []
+                for address, status in self._worker_status.items():
+                    if (
+                        time.time() - status.update_time
+                        > XINFERENCE_HEALTH_CHECK_TIMEOUT
+                    ):
+                        status.failure_remaining_count -= 1
+                    else:
+                        status.failure_remaining_count = (
+                            XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD
+                        )
+
+                    if status.failure_remaining_count <= 0:
+                        dead_models = []
+                        for model_uid in self._replica_model_uid_to_worker:
+                            if (
+                                self._replica_model_uid_to_worker[model_uid].address
+                                == address
+                            ):
+                                dead_models.append(model_uid)
+                        logger.error(
+                            "Worker dead. address: %s, influenced models: %s",
+                            address,
+                            dead_models,
+                        )
+                        dead_nodes.append(address)
+                    elif (
+                        status.failure_remaining_count
+                        != XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD
+                    ):
+                        logger.error(
+                            "Worker timeout. address: %s, check count remaining %s...",
+                            address,
+                            status.failure_remaining_count,
+                        )
+
+                for address in dead_nodes:
+                    self._worker_status.pop(address, None)
+                    self._worker_address_to_worker.pop(address, None)
+            finally:
+                await asyncio.sleep(XINFERENCE_HEALTH_CHECK_INTERVAL)
 
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str, suppress_exception=False):
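
Note on the reworked _check_dead_nodes above: a worker is only declared dead once failure_remaining_count reaches zero, and any fresh report resets the counter to XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD, so a single slow heartbeat no longer evicts a node. A self-contained replay of that counting rule, using an assumed threshold of 3 (the real defaults live in xinference/constants.py, changed +14 -4 above):

    THRESHOLD = 3  # assumed value, for illustration only

    def step(timed_out: bool, remaining: int) -> int:
        # One _check_dead_nodes iteration for a single worker: decrement on a
        # stale report, reset to the threshold on a fresh one.
        return remaining - 1 if timed_out else THRESHOLD

    remaining = THRESHOLD
    for timed_out in (True, True, False, True, True, True):
        remaining = step(timed_out, remaining)
        print(remaining)  # 2, 1, 3 (reset), 2, 1, 0 -> declared dead
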
@@ -744,13 +929,19 @@ class SupervisorActor(xo.StatelessActor):
         )
 
     async def report_worker_status(
-        self, worker_address: str, status: Dict[str, ResourceStatus]
+        self, worker_address: str, status: Dict[str, Union[ResourceStatus, GPUStatus]]
     ):
         if worker_address not in self._worker_status:
             logger.debug("Worker %s resources: %s", worker_address, status)
-            self._worker_status[worker_address] = WorkerStatus(
-                update_time=time.time(), status=status
-            )
+            self._worker_status[worker_address] = WorkerStatus(
+                update_time=time.time(),
+                failure_remaining_count=XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
+                status=status,
+            )
+        else:
+            worker_status = self._worker_status[worker_address]
+            worker_status.update_time = time.time()
+            worker_status.status = status
 
     @staticmethod
     def record_metrics(name, op, kwargs):
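
Configuration note: the constants imported at the top of this diff (XINFERENCE_HEALTH_CHECK_INTERVAL, XINFERENCE_HEALTH_CHECK_TIMEOUT, XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD, XINFERENCE_DISABLE_HEALTH_CHECK) suggest the health check is tunable. A sketch, assuming each constant reads a same-named environment variable (xinference's usual convention; verify against xinference/constants.py in this release):

    import os

    # Set these before importing or starting xinference; the constants are
    # evaluated at import time. All variable names below are assumptions.
    os.environ["XINFERENCE_HEALTH_CHECK_INTERVAL"] = "5"    # seconds between checks
    os.environ["XINFERENCE_HEALTH_CHECK_TIMEOUT"] = "30"    # report age counted as a miss
    os.environ["XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD"] = "3"  # misses before eviction
    # os.environ["XINFERENCE_DISABLE_HEALTH_CHECK"] = "1"   # opt out of the check entirely
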