xinference 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +132 -0
- xinference/api/restful_api.py +282 -78
- xinference/client/handlers.py +3 -0
- xinference/client/restful/restful_client.py +108 -75
- xinference/constants.py +14 -4
- xinference/core/cache_tracker.py +102 -0
- xinference/core/chat_interface.py +10 -4
- xinference/core/event.py +56 -0
- xinference/core/model.py +44 -0
- xinference/core/resource.py +19 -12
- xinference/core/status_guard.py +4 -0
- xinference/core/supervisor.py +278 -87
- xinference/core/utils.py +68 -3
- xinference/core/worker.py +98 -8
- xinference/deploy/cmdline.py +6 -3
- xinference/deploy/local.py +2 -2
- xinference/deploy/supervisor.py +2 -2
- xinference/model/audio/__init__.py +27 -0
- xinference/model/audio/core.py +161 -0
- xinference/model/audio/model_spec.json +79 -0
- xinference/model/audio/utils.py +18 -0
- xinference/model/audio/whisper.py +132 -0
- xinference/model/core.py +18 -13
- xinference/model/embedding/__init__.py +27 -2
- xinference/model/embedding/core.py +43 -3
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/utils.py +18 -0
- xinference/model/image/__init__.py +12 -1
- xinference/model/image/core.py +63 -9
- xinference/model/image/utils.py +26 -0
- xinference/model/llm/__init__.py +20 -1
- xinference/model/llm/core.py +43 -2
- xinference/model/llm/ggml/chatglm.py +15 -6
- xinference/model/llm/llm_family.json +197 -6
- xinference/model/llm/llm_family.py +9 -7
- xinference/model/llm/llm_family_modelscope.json +189 -4
- xinference/model/llm/pytorch/chatglm.py +3 -3
- xinference/model/llm/pytorch/core.py +4 -2
- xinference/model/{multimodal → llm/pytorch}/qwen_vl.py +10 -8
- xinference/model/llm/pytorch/utils.py +21 -9
- xinference/model/llm/pytorch/yi_vl.py +246 -0
- xinference/model/llm/utils.py +57 -4
- xinference/model/llm/vllm/core.py +5 -4
- xinference/model/rerank/__init__.py +25 -2
- xinference/model/rerank/core.py +51 -9
- xinference/model/rerank/model_spec.json +6 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -0
- xinference/{api/oauth2/common.py → model/rerank/utils.py} +6 -2
- xinference/model/utils.py +5 -3
- xinference/thirdparty/__init__.py +0 -0
- xinference/thirdparty/llava/__init__.py +1 -0
- xinference/thirdparty/llava/conversation.py +205 -0
- xinference/thirdparty/llava/mm_utils.py +122 -0
- xinference/thirdparty/llava/model/__init__.py +1 -0
- xinference/thirdparty/llava/model/clip_encoder/__init__.py +0 -0
- xinference/thirdparty/llava/model/clip_encoder/builder.py +11 -0
- xinference/thirdparty/llava/model/clip_encoder/clip_encoder.py +86 -0
- xinference/thirdparty/llava/model/constants.py +6 -0
- xinference/thirdparty/llava/model/llava_arch.py +385 -0
- xinference/thirdparty/llava/model/llava_llama.py +163 -0
- xinference/thirdparty/llava/model/multimodal_projector/__init__.py +0 -0
- xinference/thirdparty/llava/model/multimodal_projector/builder.py +64 -0
- xinference/types.py +1 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.15822aeb.js +3 -0
- xinference/web/ui/build/static/js/main.15822aeb.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/139e5e4adf436923107d2b02994c7ff6dba2aac1989e9b6638984f0dfe782c4a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/64accc515dc6cd584a2873796cd7da6f93de57f7e465eb5423cca9a2f3fe3eff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/65ca3ba225b8c8dac907210545b51f2fcdb2591f0feeb7195f1c037f2bc956a0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b80db1012318b97c329c4e3e72454f7512fb107e57c444b437dbe4ba1a3faa5a.json +1 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/METADATA +33 -23
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/RECORD +81 -64
- xinference/api/oauth2/core.py +0 -93
- xinference/model/multimodal/__init__.py +0 -52
- xinference/model/multimodal/core.py +0 -467
- xinference/model/multimodal/model_spec.json +0 -43
- xinference/model/multimodal/model_spec_modelscope.json +0 -45
- xinference/web/ui/build/static/js/main.b83095c2.js +0 -3
- xinference/web/ui/build/static/js/main.b83095c2.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +0 -1
- /xinference/web/ui/build/static/js/{main.b83095c2.js.LICENSE.txt → main.15822aeb.js.LICENSE.txt} +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/LICENSE +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/WHEEL +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.1.dist-info → xinference-0.8.3.dist-info}/top_level.txt +0 -0
xinference/core/supervisor.py
CHANGED
|
@@ -21,10 +21,16 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Un
|
|
|
21
21
|
|
|
22
22
|
import xoscar as xo
|
|
23
23
|
|
|
24
|
+
from ..constants import (
|
|
25
|
+
XINFERENCE_DISABLE_HEALTH_CHECK,
|
|
26
|
+
XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
|
|
27
|
+
XINFERENCE_HEALTH_CHECK_INTERVAL,
|
|
28
|
+
XINFERENCE_HEALTH_CHECK_TIMEOUT,
|
|
29
|
+
)
|
|
24
30
|
from ..core import ModelActor
|
|
25
31
|
from ..core.status_guard import InstanceInfo, LaunchStatus
|
|
26
32
|
from .metrics import record_metrics
|
|
27
|
-
from .resource import ResourceStatus
|
|
33
|
+
from .resource import GPUStatus, ResourceStatus
|
|
28
34
|
from .utils import (
|
|
29
35
|
build_replica_model_uid,
|
|
30
36
|
gen_random_string,
|
|
@@ -32,14 +38,15 @@ from .utils import (
|
|
|
32
38
|
iter_replica_model_uid,
|
|
33
39
|
log_async,
|
|
34
40
|
log_sync,
|
|
41
|
+
parse_model_version,
|
|
35
42
|
parse_replica_model_uid,
|
|
36
43
|
)
|
|
37
44
|
|
|
38
45
|
if TYPE_CHECKING:
|
|
46
|
+
from ..model.audio import AudioModelFamilyV1
|
|
39
47
|
from ..model.embedding import EmbeddingModelSpec
|
|
40
48
|
from ..model.image import ImageModelFamilyV1
|
|
41
49
|
from ..model.llm import LLMFamilyV1
|
|
42
|
-
from ..model.multimodal import LVLMFamilyV1
|
|
43
50
|
from ..model.rerank import RerankModelSpec
|
|
44
51
|
from .worker import WorkerActor
|
|
45
52
|
|
|
@@ -47,7 +54,6 @@ if TYPE_CHECKING:
|
|
|
47
54
|
logger = getLogger(__name__)
|
|
48
55
|
|
|
49
56
|
|
|
50
|
-
DEFAULT_NODE_TIMEOUT = 60
|
|
51
57
|
ASYNC_LAUNCH_TASKS = {} # type: ignore
|
|
52
58
|
|
|
53
59
|
|
|
@@ -59,7 +65,8 @@ def callback_for_async_launch(model_uid: str):
|
|
|
59
65
|
@dataclass
|
|
60
66
|
class WorkerStatus:
|
|
61
67
|
update_time: float
|
|
62
|
-
|
|
68
|
+
failure_remaining_count: int
|
|
69
|
+
status: Dict[str, Union[ResourceStatus, GPUStatus]]
|
|
63
70
|
|
|
64
71
|
|
|
65
72
|
@dataclass
|
|
@@ -86,9 +93,17 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
86
93
|
|
|
87
94
|
async def __post_create__(self):
|
|
88
95
|
self._uptime = time.time()
|
|
89
|
-
|
|
90
|
-
|
|
96
|
+
if not XINFERENCE_DISABLE_HEALTH_CHECK:
|
|
97
|
+
# Run _check_dead_nodes() in a dedicated thread.
|
|
98
|
+
from ..isolation import Isolation
|
|
99
|
+
|
|
100
|
+
self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
|
|
101
|
+
self._isolation.start()
|
|
102
|
+
asyncio.run_coroutine_threadsafe(
|
|
103
|
+
self._check_dead_nodes(), loop=self._isolation.loop
|
|
104
|
+
)
|
|
91
105
|
logger.info(f"Xinference supervisor {self.address} started")
|
|
106
|
+
from .cache_tracker import CacheTrackerActor
|
|
92
107
|
from .status_guard import StatusGuardActor
|
|
93
108
|
|
|
94
109
|
self._status_guard_ref: xo.ActorRefType[
|
|
@@ -96,30 +111,98 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
96
111
|
] = await xo.create_actor(
|
|
97
112
|
StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
|
|
98
113
|
)
|
|
114
|
+
self._cache_tracker_ref: xo.ActorRefType[
|
|
115
|
+
"CacheTrackerActor"
|
|
116
|
+
] = await xo.create_actor(
|
|
117
|
+
CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
from .event import EventCollectorActor
|
|
121
|
+
|
|
122
|
+
self._event_collector_ref: xo.ActorRefType[
|
|
123
|
+
EventCollectorActor
|
|
124
|
+
] = await xo.create_actor(
|
|
125
|
+
EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
|
|
126
|
+
)
|
|
99
127
|
|
|
100
128
|
from ..model.embedding import (
|
|
101
129
|
CustomEmbeddingModelSpec,
|
|
130
|
+
generate_embedding_description,
|
|
131
|
+
get_embedding_model_descriptions,
|
|
102
132
|
register_embedding,
|
|
103
133
|
unregister_embedding,
|
|
104
134
|
)
|
|
105
|
-
from ..model.
|
|
106
|
-
from ..model.llm
|
|
107
|
-
|
|
135
|
+
from ..model.image import get_image_model_descriptions
|
|
136
|
+
from ..model.llm import (
|
|
137
|
+
CustomLLMFamilyV1,
|
|
138
|
+
generate_llm_description,
|
|
139
|
+
get_llm_model_descriptions,
|
|
140
|
+
register_llm,
|
|
141
|
+
unregister_llm,
|
|
142
|
+
)
|
|
143
|
+
from ..model.rerank import (
|
|
108
144
|
CustomRerankModelSpec,
|
|
145
|
+
generate_rerank_description,
|
|
146
|
+
get_rerank_model_descriptions,
|
|
109
147
|
register_rerank,
|
|
110
148
|
unregister_rerank,
|
|
111
149
|
)
|
|
112
150
|
|
|
113
151
|
self._custom_register_type_to_cls: Dict[str, Tuple] = {
|
|
114
|
-
"LLM": (
|
|
152
|
+
"LLM": (
|
|
153
|
+
CustomLLMFamilyV1,
|
|
154
|
+
register_llm,
|
|
155
|
+
unregister_llm,
|
|
156
|
+
generate_llm_description,
|
|
157
|
+
),
|
|
115
158
|
"embedding": (
|
|
116
159
|
CustomEmbeddingModelSpec,
|
|
117
160
|
register_embedding,
|
|
118
161
|
unregister_embedding,
|
|
162
|
+
generate_embedding_description,
|
|
163
|
+
),
|
|
164
|
+
"rerank": (
|
|
165
|
+
CustomRerankModelSpec,
|
|
166
|
+
register_rerank,
|
|
167
|
+
unregister_rerank,
|
|
168
|
+
generate_rerank_description,
|
|
119
169
|
),
|
|
120
|
-
"rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
|
|
121
170
|
}
|
|
122
171
|
|
|
172
|
+
# record model version
|
|
173
|
+
model_version_infos: Dict[str, List[Dict]] = {}
|
|
174
|
+
model_version_infos.update(get_llm_model_descriptions())
|
|
175
|
+
model_version_infos.update(get_embedding_model_descriptions())
|
|
176
|
+
model_version_infos.update(get_rerank_model_descriptions())
|
|
177
|
+
model_version_infos.update(get_image_model_descriptions())
|
|
178
|
+
await self._cache_tracker_ref.record_model_version(
|
|
179
|
+
model_version_infos, self.address
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
async def get_cluster_device_info(self) -> List:
|
|
183
|
+
supervisor_device_info = {
|
|
184
|
+
"ip_address": self.address.split(":")[0],
|
|
185
|
+
"gpu_count": 0,
|
|
186
|
+
"gpu_vram_total": 0,
|
|
187
|
+
}
|
|
188
|
+
res = [{"node_type": "Supervisor", **supervisor_device_info}]
|
|
189
|
+
for worker_addr, worker_status in self._worker_status.items():
|
|
190
|
+
vram_total: float = sum(
|
|
191
|
+
[v.mem_total for k, v in worker_status.status.items() if k != "cpu"] # type: ignore
|
|
192
|
+
)
|
|
193
|
+
total = (
|
|
194
|
+
vram_total if vram_total == 0 else f"{int(vram_total / 1024 / 1024)}MiB"
|
|
195
|
+
)
|
|
196
|
+
res.append(
|
|
197
|
+
{
|
|
198
|
+
"node_type": "Worker",
|
|
199
|
+
"ip_address": worker_addr.split(":")[0],
|
|
200
|
+
"gpu_count": len(worker_status.status) - 1,
|
|
201
|
+
"gpu_vram_total": total,
|
|
202
|
+
}
|
|
203
|
+
)
|
|
204
|
+
return res
|
|
205
|
+
|
|
123
206
|
@staticmethod
|
|
124
207
|
async def get_builtin_prompts() -> Dict[str, Any]:
|
|
125
208
|
from ..model.llm.llm_family import BUILTIN_LLM_PROMPT_STYLE
|
|
@@ -180,99 +263,129 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
180
263
|
"workers": self._worker_status,
|
|
181
264
|
}
|
|
182
265
|
|
|
183
|
-
def _to_llm_reg(
|
|
266
|
+
async def _to_llm_reg(
|
|
184
267
|
self, llm_family: "LLMFamilyV1", is_builtin: bool
|
|
185
268
|
) -> Dict[str, Any]:
|
|
186
269
|
from ..model.llm import get_cache_status
|
|
187
270
|
|
|
271
|
+
instance_cnt = await self.get_instance_count(llm_family.model_name)
|
|
272
|
+
version_cnt = await self.get_model_version_count(llm_family.model_name)
|
|
273
|
+
|
|
188
274
|
if self.is_local_deployment():
|
|
189
275
|
specs = []
|
|
190
276
|
# TODO: does not work when the supervisor and worker are running on separate nodes.
|
|
191
277
|
for spec in llm_family.model_specs:
|
|
192
278
|
cache_status = get_cache_status(llm_family, spec)
|
|
193
279
|
specs.append({**spec.dict(), "cache_status": cache_status})
|
|
194
|
-
|
|
280
|
+
res = {**llm_family.dict(), "is_builtin": is_builtin, "model_specs": specs}
|
|
195
281
|
else:
|
|
196
|
-
|
|
282
|
+
res = {**llm_family.dict(), "is_builtin": is_builtin}
|
|
283
|
+
res["model_version_count"] = version_cnt
|
|
284
|
+
res["model_instance_count"] = instance_cnt
|
|
285
|
+
return res
|
|
197
286
|
|
|
198
|
-
def _to_embedding_model_reg(
|
|
287
|
+
async def _to_embedding_model_reg(
|
|
199
288
|
self, model_spec: "EmbeddingModelSpec", is_builtin: bool
|
|
200
289
|
) -> Dict[str, Any]:
|
|
201
290
|
from ..model.embedding import get_cache_status
|
|
202
291
|
|
|
292
|
+
instance_cnt = await self.get_instance_count(model_spec.model_name)
|
|
293
|
+
version_cnt = await self.get_model_version_count(model_spec.model_name)
|
|
294
|
+
|
|
203
295
|
if self.is_local_deployment():
|
|
204
296
|
# TODO: does not work when the supervisor and worker are running on separate nodes.
|
|
205
297
|
cache_status = get_cache_status(model_spec)
|
|
206
|
-
|
|
298
|
+
res = {
|
|
207
299
|
**model_spec.dict(),
|
|
208
300
|
"cache_status": cache_status,
|
|
209
301
|
"is_builtin": is_builtin,
|
|
210
302
|
}
|
|
211
303
|
else:
|
|
212
|
-
|
|
304
|
+
res = {
|
|
213
305
|
**model_spec.dict(),
|
|
214
306
|
"is_builtin": is_builtin,
|
|
215
307
|
}
|
|
308
|
+
res["model_version_count"] = version_cnt
|
|
309
|
+
res["model_instance_count"] = instance_cnt
|
|
310
|
+
return res
|
|
216
311
|
|
|
217
|
-
def _to_rerank_model_reg(
|
|
312
|
+
async def _to_rerank_model_reg(
|
|
218
313
|
self, model_spec: "RerankModelSpec", is_builtin: bool
|
|
219
314
|
) -> Dict[str, Any]:
|
|
220
315
|
from ..model.rerank import get_cache_status
|
|
221
316
|
|
|
317
|
+
instance_cnt = await self.get_instance_count(model_spec.model_name)
|
|
318
|
+
version_cnt = await self.get_model_version_count(model_spec.model_name)
|
|
319
|
+
|
|
222
320
|
if self.is_local_deployment():
|
|
223
321
|
# TODO: does not work when the supervisor and worker are running on separate nodes.
|
|
224
322
|
cache_status = get_cache_status(model_spec)
|
|
225
|
-
|
|
323
|
+
res = {
|
|
226
324
|
**model_spec.dict(),
|
|
227
325
|
"cache_status": cache_status,
|
|
228
326
|
"is_builtin": is_builtin,
|
|
229
327
|
}
|
|
230
328
|
else:
|
|
231
|
-
|
|
329
|
+
res = {
|
|
232
330
|
**model_spec.dict(),
|
|
233
331
|
"is_builtin": is_builtin,
|
|
234
332
|
}
|
|
333
|
+
res["model_version_count"] = version_cnt
|
|
334
|
+
res["model_instance_count"] = instance_cnt
|
|
335
|
+
return res
|
|
235
336
|
|
|
236
|
-
def _to_image_model_reg(
|
|
337
|
+
async def _to_image_model_reg(
|
|
237
338
|
self, model_family: "ImageModelFamilyV1", is_builtin: bool
|
|
238
339
|
) -> Dict[str, Any]:
|
|
239
340
|
from ..model.image import get_cache_status
|
|
240
341
|
|
|
342
|
+
instance_cnt = await self.get_instance_count(model_family.model_name)
|
|
343
|
+
version_cnt = await self.get_model_version_count(model_family.model_name)
|
|
344
|
+
|
|
241
345
|
if self.is_local_deployment():
|
|
242
346
|
# TODO: does not work when the supervisor and worker are running on separate nodes.
|
|
243
347
|
cache_status = get_cache_status(model_family)
|
|
244
|
-
|
|
348
|
+
res = {
|
|
245
349
|
**model_family.dict(),
|
|
246
350
|
"cache_status": cache_status,
|
|
247
351
|
"is_builtin": is_builtin,
|
|
248
352
|
}
|
|
249
353
|
else:
|
|
250
|
-
|
|
354
|
+
res = {
|
|
251
355
|
**model_family.dict(),
|
|
252
356
|
"is_builtin": is_builtin,
|
|
253
357
|
}
|
|
358
|
+
res["model_version_count"] = version_cnt
|
|
359
|
+
res["model_instance_count"] = instance_cnt
|
|
360
|
+
return res
|
|
254
361
|
|
|
255
|
-
def
|
|
256
|
-
self, model_family: "
|
|
362
|
+
async def _to_audio_model_reg(
|
|
363
|
+
self, model_family: "AudioModelFamilyV1", is_builtin: bool
|
|
257
364
|
) -> Dict[str, Any]:
|
|
258
|
-
from ..model.
|
|
365
|
+
from ..model.audio import get_cache_status
|
|
366
|
+
|
|
367
|
+
instance_cnt = await self.get_instance_count(model_family.model_name)
|
|
368
|
+
version_cnt = await self.get_model_version_count(model_family.model_name)
|
|
259
369
|
|
|
260
370
|
if self.is_local_deployment():
|
|
261
|
-
specs = []
|
|
262
371
|
# TODO: does not work when the supervisor and worker are running on separate nodes.
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
specs.append({**spec.dict(), "cache_status": cache_status})
|
|
266
|
-
return {
|
|
372
|
+
cache_status = get_cache_status(model_family)
|
|
373
|
+
res = {
|
|
267
374
|
**model_family.dict(),
|
|
375
|
+
"cache_status": cache_status,
|
|
268
376
|
"is_builtin": is_builtin,
|
|
269
|
-
"model_specs": specs,
|
|
270
377
|
}
|
|
271
378
|
else:
|
|
272
|
-
|
|
379
|
+
res = {
|
|
380
|
+
**model_family.dict(),
|
|
381
|
+
"is_builtin": is_builtin,
|
|
382
|
+
}
|
|
383
|
+
res["model_version_count"] = version_cnt
|
|
384
|
+
res["model_instance_count"] = instance_cnt
|
|
385
|
+
return res
|
|
273
386
|
|
|
274
|
-
@
|
|
275
|
-
def list_model_registrations(
|
|
387
|
+
@log_async(logger=logger)
|
|
388
|
+
async def list_model_registrations(
|
|
276
389
|
self, model_type: str, detailed: bool = False
|
|
277
390
|
) -> List[Dict[str, Any]]:
|
|
278
391
|
def sort_helper(item):
|
|
@@ -285,13 +398,13 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
285
398
|
ret = []
|
|
286
399
|
for family in BUILTIN_LLM_FAMILIES:
|
|
287
400
|
if detailed:
|
|
288
|
-
ret.append(self._to_llm_reg(family, True))
|
|
401
|
+
ret.append(await self._to_llm_reg(family, True))
|
|
289
402
|
else:
|
|
290
403
|
ret.append({"model_name": family.model_name, "is_builtin": True})
|
|
291
404
|
|
|
292
405
|
for family in get_user_defined_llm_families():
|
|
293
406
|
if detailed:
|
|
294
|
-
ret.append(self._to_llm_reg(family, False))
|
|
407
|
+
ret.append(await self._to_llm_reg(family, False))
|
|
295
408
|
else:
|
|
296
409
|
ret.append({"model_name": family.model_name, "is_builtin": False})
|
|
297
410
|
|
|
@@ -304,14 +417,16 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
304
417
|
ret = []
|
|
305
418
|
for model_name, family in BUILTIN_EMBEDDING_MODELS.items():
|
|
306
419
|
if detailed:
|
|
307
|
-
ret.append(
|
|
420
|
+
ret.append(
|
|
421
|
+
await self._to_embedding_model_reg(family, is_builtin=True)
|
|
422
|
+
)
|
|
308
423
|
else:
|
|
309
424
|
ret.append({"model_name": model_name, "is_builtin": True})
|
|
310
425
|
|
|
311
426
|
for model_spec in get_user_defined_embeddings():
|
|
312
427
|
if detailed:
|
|
313
428
|
ret.append(
|
|
314
|
-
self._to_embedding_model_reg(model_spec, is_builtin=False)
|
|
429
|
+
await self._to_embedding_model_reg(model_spec, is_builtin=False)
|
|
315
430
|
)
|
|
316
431
|
else:
|
|
317
432
|
ret.append(
|
|
@@ -326,7 +441,19 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
326
441
|
ret = []
|
|
327
442
|
for model_name, family in BUILTIN_IMAGE_MODELS.items():
|
|
328
443
|
if detailed:
|
|
329
|
-
ret.append(self._to_image_model_reg(family, is_builtin=True))
|
|
444
|
+
ret.append(await self._to_image_model_reg(family, is_builtin=True))
|
|
445
|
+
else:
|
|
446
|
+
ret.append({"model_name": model_name, "is_builtin": True})
|
|
447
|
+
|
|
448
|
+
ret.sort(key=sort_helper)
|
|
449
|
+
return ret
|
|
450
|
+
elif model_type == "audio":
|
|
451
|
+
from ..model.audio import BUILTIN_AUDIO_MODELS
|
|
452
|
+
|
|
453
|
+
ret = []
|
|
454
|
+
for model_name, family in BUILTIN_AUDIO_MODELS.items():
|
|
455
|
+
if detailed:
|
|
456
|
+
ret.append(await self._to_audio_model_reg(family, is_builtin=True))
|
|
330
457
|
else:
|
|
331
458
|
ret.append({"model_name": model_name, "is_builtin": True})
|
|
332
459
|
|
|
@@ -339,30 +466,20 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
339
466
|
ret = []
|
|
340
467
|
for model_name, family in BUILTIN_RERANK_MODELS.items():
|
|
341
468
|
if detailed:
|
|
342
|
-
ret.append(self._to_rerank_model_reg(family, is_builtin=True))
|
|
469
|
+
ret.append(await self._to_rerank_model_reg(family, is_builtin=True))
|
|
343
470
|
else:
|
|
344
471
|
ret.append({"model_name": model_name, "is_builtin": True})
|
|
345
472
|
|
|
346
473
|
for model_spec in get_user_defined_reranks():
|
|
347
474
|
if detailed:
|
|
348
|
-
ret.append(
|
|
475
|
+
ret.append(
|
|
476
|
+
await self._to_rerank_model_reg(model_spec, is_builtin=False)
|
|
477
|
+
)
|
|
349
478
|
else:
|
|
350
479
|
ret.append(
|
|
351
480
|
{"model_name": model_spec.model_name, "is_builtin": False}
|
|
352
481
|
)
|
|
353
482
|
|
|
354
|
-
ret.sort(key=sort_helper)
|
|
355
|
-
return ret
|
|
356
|
-
elif model_type == "multimodal":
|
|
357
|
-
from ..model.multimodal import BUILTIN_LVLM_FAMILIES
|
|
358
|
-
|
|
359
|
-
ret = []
|
|
360
|
-
for family in BUILTIN_LVLM_FAMILIES:
|
|
361
|
-
if detailed:
|
|
362
|
-
ret.append(self._to_multimodal_reg(family, True))
|
|
363
|
-
else:
|
|
364
|
-
ret.append({"model_name": family.model_name, "is_builtin": True})
|
|
365
|
-
|
|
366
483
|
ret.sort(key=sort_helper)
|
|
367
484
|
return ret
|
|
368
485
|
else:
|
|
@@ -395,18 +512,18 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
395
512
|
if f.model_name == model_name:
|
|
396
513
|
return f
|
|
397
514
|
raise ValueError(f"Model {model_name} not found")
|
|
398
|
-
elif model_type == "
|
|
399
|
-
from ..model.
|
|
400
|
-
from ..model.rerank.custom import get_user_defined_reranks
|
|
515
|
+
elif model_type == "audio":
|
|
516
|
+
from ..model.audio import BUILTIN_AUDIO_MODELS
|
|
401
517
|
|
|
402
|
-
for f in
|
|
518
|
+
for f in BUILTIN_AUDIO_MODELS.values():
|
|
403
519
|
if f.model_name == model_name:
|
|
404
520
|
return f
|
|
405
521
|
raise ValueError(f"Model {model_name} not found")
|
|
406
|
-
elif model_type == "
|
|
407
|
-
from ..model.
|
|
522
|
+
elif model_type == "rerank":
|
|
523
|
+
from ..model.rerank import BUILTIN_RERANK_MODELS
|
|
524
|
+
from ..model.rerank.custom import get_user_defined_reranks
|
|
408
525
|
|
|
409
|
-
for f in
|
|
526
|
+
for f in list(BUILTIN_RERANK_MODELS.values()) + get_user_defined_reranks():
|
|
410
527
|
if f.model_name == model_name:
|
|
411
528
|
return f
|
|
412
529
|
raise ValueError(f"Model {model_name} not found")
|
|
@@ -420,6 +537,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
420
537
|
model_spec_cls,
|
|
421
538
|
register_fn,
|
|
422
539
|
unregister_fn,
|
|
540
|
+
generate_fn,
|
|
423
541
|
) = self._custom_register_type_to_cls[model_type]
|
|
424
542
|
|
|
425
543
|
if not self.is_local_deployment():
|
|
@@ -430,6 +548,9 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
430
548
|
model_spec = model_spec_cls.parse_raw(model)
|
|
431
549
|
try:
|
|
432
550
|
register_fn(model_spec, persist)
|
|
551
|
+
await self._cache_tracker_ref.record_model_version(
|
|
552
|
+
generate_fn(model_spec), self.address
|
|
553
|
+
)
|
|
433
554
|
except Exception as e:
|
|
434
555
|
unregister_fn(model_spec.model_name, raise_error=False)
|
|
435
556
|
raise e
|
|
@@ -439,8 +560,9 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
439
560
|
@log_async(logger=logger)
|
|
440
561
|
async def unregister_model(self, model_type: str, model_name: str):
|
|
441
562
|
if model_type in self._custom_register_type_to_cls:
|
|
442
|
-
_, _, unregister_fn = self._custom_register_type_to_cls[model_type]
|
|
563
|
+
_, _, unregister_fn, _ = self._custom_register_type_to_cls[model_type]
|
|
443
564
|
unregister_fn(model_name)
|
|
565
|
+
await self._cache_tracker_ref.unregister_model_version(model_name)
|
|
444
566
|
|
|
445
567
|
if not self.is_local_deployment():
|
|
446
568
|
workers = list(self._worker_address_to_worker.values())
|
|
@@ -457,6 +579,43 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
457
579
|
)
|
|
458
580
|
return f"{model_name}-{gen_random_string(8)}"
|
|
459
581
|
|
|
582
|
+
async def get_model_versions(self, model_type: str, model_name: str) -> List[Dict]:
|
|
583
|
+
return await self._cache_tracker_ref.get_model_versions(model_name)
|
|
584
|
+
|
|
585
|
+
async def get_model_version_count(self, model_name: str) -> int:
|
|
586
|
+
return await self._cache_tracker_ref.get_model_version_count(model_name)
|
|
587
|
+
|
|
588
|
+
@log_async(logger=logger)
|
|
589
|
+
async def launch_model_by_version(
|
|
590
|
+
self,
|
|
591
|
+
model_uid: Optional[str],
|
|
592
|
+
model_type: str,
|
|
593
|
+
model_version: str,
|
|
594
|
+
replica: int = 1,
|
|
595
|
+
n_gpu: Optional[Union[int, str]] = "auto",
|
|
596
|
+
wait_ready: bool = True,
|
|
597
|
+
):
|
|
598
|
+
parse_results = parse_model_version(model_version, model_type)
|
|
599
|
+
|
|
600
|
+
if model_type == "image" and len(parse_results) == 2:
|
|
601
|
+
kwargs = {"controlnet": parse_results[1]}
|
|
602
|
+
else:
|
|
603
|
+
kwargs = {}
|
|
604
|
+
|
|
605
|
+
return await self.launch_builtin_model(
|
|
606
|
+
model_uid=model_uid,
|
|
607
|
+
model_name=parse_results[0],
|
|
608
|
+
model_size_in_billions=parse_results[1] if model_type == "LLM" else None,
|
|
609
|
+
model_format=parse_results[2] if model_type == "LLM" else None,
|
|
610
|
+
quantization=parse_results[3] if model_type == "LLM" else None,
|
|
611
|
+
model_type=model_type,
|
|
612
|
+
replica=replica,
|
|
613
|
+
n_gpu=n_gpu,
|
|
614
|
+
wait_ready=wait_ready,
|
|
615
|
+
model_version=model_version,
|
|
616
|
+
**kwargs,
|
|
617
|
+
)
|
|
618
|
+
|
|
460
619
|
async def launch_speculative_llm(
|
|
461
620
|
self,
|
|
462
621
|
model_uid: Optional[str],
|
|
@@ -529,6 +688,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
529
688
|
n_gpu: Optional[Union[int, str]] = "auto",
|
|
530
689
|
request_limits: Optional[int] = None,
|
|
531
690
|
wait_ready: bool = True,
|
|
691
|
+
model_version: Optional[str] = None,
|
|
532
692
|
**kwargs,
|
|
533
693
|
) -> str:
|
|
534
694
|
if model_uid is None:
|
|
@@ -601,6 +761,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
601
761
|
instance_info = InstanceInfo(
|
|
602
762
|
model_name=model_name,
|
|
603
763
|
model_uid=model_uid,
|
|
764
|
+
model_version=model_version,
|
|
604
765
|
model_ability=[],
|
|
605
766
|
replica=replica,
|
|
606
767
|
status=LaunchStatus.CREATING.name,
|
|
@@ -623,29 +784,53 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
623
784
|
)
|
|
624
785
|
return [info.dict() for info in sorted(infos, key=lambda info: info.model_uid)]
|
|
625
786
|
|
|
787
|
+
async def get_instance_count(self, model_name: str) -> int:
|
|
788
|
+
return await self._status_guard_ref.get_instance_count(model_name)
|
|
789
|
+
|
|
626
790
|
async def _check_dead_nodes(self):
|
|
627
791
|
while True:
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
792
|
+
try:
|
|
793
|
+
dead_nodes = []
|
|
794
|
+
for address, status in self._worker_status.items():
|
|
795
|
+
if (
|
|
796
|
+
time.time() - status.update_time
|
|
797
|
+
> XINFERENCE_HEALTH_CHECK_TIMEOUT
|
|
798
|
+
):
|
|
799
|
+
status.failure_remaining_count -= 1
|
|
800
|
+
else:
|
|
801
|
+
status.failure_remaining_count = (
|
|
802
|
+
XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD
|
|
803
|
+
)
|
|
804
|
+
|
|
805
|
+
if status.failure_remaining_count <= 0:
|
|
806
|
+
dead_models = []
|
|
807
|
+
for model_uid in self._replica_model_uid_to_worker:
|
|
808
|
+
if (
|
|
809
|
+
self._replica_model_uid_to_worker[model_uid].address
|
|
810
|
+
== address
|
|
811
|
+
):
|
|
812
|
+
dead_models.append(model_uid)
|
|
813
|
+
logger.error(
|
|
814
|
+
"Worker dead. address: %s, influenced models: %s",
|
|
815
|
+
address,
|
|
816
|
+
dead_models,
|
|
817
|
+
)
|
|
818
|
+
dead_nodes.append(address)
|
|
819
|
+
elif (
|
|
820
|
+
status.failure_remaining_count
|
|
821
|
+
!= XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD
|
|
822
|
+
):
|
|
823
|
+
logger.error(
|
|
824
|
+
"Worker timeout. address: %s, check count remaining %s...",
|
|
825
|
+
address,
|
|
826
|
+
status.failure_remaining_count,
|
|
827
|
+
)
|
|
828
|
+
|
|
829
|
+
for address in dead_nodes:
|
|
830
|
+
self._worker_status.pop(address, None)
|
|
831
|
+
self._worker_address_to_worker.pop(address, None)
|
|
832
|
+
finally:
|
|
833
|
+
await asyncio.sleep(XINFERENCE_HEALTH_CHECK_INTERVAL)
|
|
649
834
|
|
|
650
835
|
@log_async(logger=logger)
|
|
651
836
|
async def terminate_model(self, model_uid: str, suppress_exception=False):
|
|
@@ -744,13 +929,19 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
744
929
|
)
|
|
745
930
|
|
|
746
931
|
async def report_worker_status(
|
|
747
|
-
self, worker_address: str, status: Dict[str, ResourceStatus]
|
|
932
|
+
self, worker_address: str, status: Dict[str, Union[ResourceStatus, GPUStatus]]
|
|
748
933
|
):
|
|
749
934
|
if worker_address not in self._worker_status:
|
|
750
935
|
logger.debug("Worker %s resources: %s", worker_address, status)
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
936
|
+
self._worker_status[worker_address] = WorkerStatus(
|
|
937
|
+
update_time=time.time(),
|
|
938
|
+
failure_remaining_count=XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD,
|
|
939
|
+
status=status,
|
|
940
|
+
)
|
|
941
|
+
else:
|
|
942
|
+
worker_status = self._worker_status[worker_address]
|
|
943
|
+
worker_status.update_time = time.time()
|
|
944
|
+
worker_status.status = status
|
|
754
945
|
|
|
755
946
|
@staticmethod
|
|
756
947
|
def record_metrics(name, op, kwargs):
|