xinference 0.10.2.post1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +1 -1
- xinference/api/restful_api.py +53 -61
- xinference/client/restful/restful_client.py +52 -57
- xinference/conftest.py +1 -1
- xinference/core/cache_tracker.py +1 -1
- xinference/core/event.py +1 -1
- xinference/core/model.py +15 -4
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +58 -72
- xinference/core/worker.py +73 -102
- xinference/deploy/cmdline.py +175 -6
- xinference/deploy/test/test_cmdline.py +2 -0
- xinference/deploy/utils.py +1 -1
- xinference/device_utils.py +29 -3
- xinference/fields.py +5 -1
- xinference/model/audio/model_spec.json +8 -1
- xinference/model/audio/whisper.py +88 -12
- xinference/model/core.py +2 -2
- xinference/model/embedding/core.py +13 -0
- xinference/model/image/__init__.py +29 -0
- xinference/model/image/core.py +6 -0
- xinference/model/image/custom.py +109 -0
- xinference/model/llm/__init__.py +92 -32
- xinference/model/llm/core.py +57 -102
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
- xinference/model/llm/llm_family.json +446 -2
- xinference/model/llm/llm_family.py +45 -41
- xinference/model/llm/llm_family_modelscope.json +208 -1
- xinference/model/llm/pytorch/deepseek_vl.py +89 -33
- xinference/model/llm/pytorch/qwen_vl.py +67 -12
- xinference/model/llm/pytorch/yi_vl.py +62 -45
- xinference/model/llm/utils.py +45 -15
- xinference/model/llm/vllm/core.py +21 -4
- xinference/model/rerank/core.py +48 -20
- xinference/thirdparty/omnilmm/chat.py +2 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +6 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.54bca460.css +2 -0
- xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
- xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +33 -0
- xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
- xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
- xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
- xinference/web/ui/node_modules/clipboard/bower.json +18 -0
- xinference/web/ui/node_modules/clipboard/composer.json +25 -0
- xinference/web/ui/node_modules/clipboard/package.json +63 -0
- xinference/web/ui/node_modules/delegate/package.json +31 -0
- xinference/web/ui/node_modules/good-listener/bower.json +11 -0
- xinference/web/ui/node_modules/good-listener/package.json +35 -0
- xinference/web/ui/node_modules/select/bower.json +13 -0
- xinference/web/ui/node_modules/select/package.json +29 -0
- xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
- xinference/web/ui/package-lock.json +34 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/METADATA +14 -13
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/RECORD +81 -60
- xinference/client/oscar/__init__.py +0 -13
- xinference/client/oscar/actor_client.py +0 -611
- xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
- xinference/model/llm/pytorch/spec_model.py +0 -186
- xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
xinference/core/supervisor.py
CHANGED
|
@@ -80,12 +80,12 @@ class ReplicaInfo:
|
|
|
80
80
|
class SupervisorActor(xo.StatelessActor):
|
|
81
81
|
def __init__(self):
|
|
82
82
|
super().__init__()
|
|
83
|
-
self._worker_address_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = {}
|
|
84
|
-
self._worker_status: Dict[str, WorkerStatus] = {}
|
|
85
|
-
self._replica_model_uid_to_worker: Dict[
|
|
83
|
+
self._worker_address_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = {} # type: ignore
|
|
84
|
+
self._worker_status: Dict[str, WorkerStatus] = {} # type: ignore
|
|
85
|
+
self._replica_model_uid_to_worker: Dict[ # type: ignore
|
|
86
86
|
str, xo.ActorRefType["WorkerActor"]
|
|
87
87
|
] = {}
|
|
88
|
-
self._model_uid_to_replica_info: Dict[str, ReplicaInfo] = {}
|
|
88
|
+
self._model_uid_to_replica_info: Dict[str, ReplicaInfo] = {} # type: ignore
|
|
89
89
|
self._uptime = None
|
|
90
90
|
self._lock = asyncio.Lock()
|
|
91
91
|
|
|
@@ -117,12 +117,12 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
117
117
|
from .cache_tracker import CacheTrackerActor
|
|
118
118
|
from .status_guard import StatusGuardActor
|
|
119
119
|
|
|
120
|
-
self._status_guard_ref: xo.ActorRefType[
|
|
120
|
+
self._status_guard_ref: xo.ActorRefType[ # type: ignore
|
|
121
121
|
"StatusGuardActor"
|
|
122
122
|
] = await xo.create_actor(
|
|
123
123
|
StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
|
|
124
124
|
)
|
|
125
|
-
self._cache_tracker_ref: xo.ActorRefType[
|
|
125
|
+
self._cache_tracker_ref: xo.ActorRefType[ # type: ignore
|
|
126
126
|
"CacheTrackerActor"
|
|
127
127
|
] = await xo.create_actor(
|
|
128
128
|
CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
|
|
@@ -130,7 +130,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
130
130
|
|
|
131
131
|
from .event import EventCollectorActor
|
|
132
132
|
|
|
133
|
-
self._event_collector_ref: xo.ActorRefType[
|
|
133
|
+
self._event_collector_ref: xo.ActorRefType[ # type: ignore
|
|
134
134
|
EventCollectorActor
|
|
135
135
|
] = await xo.create_actor(
|
|
136
136
|
EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
|
|
@@ -150,7 +150,13 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
150
150
|
register_embedding,
|
|
151
151
|
unregister_embedding,
|
|
152
152
|
)
|
|
153
|
-
from ..model.image import
|
|
153
|
+
from ..model.image import (
|
|
154
|
+
CustomImageModelFamilyV1,
|
|
155
|
+
generate_image_description,
|
|
156
|
+
get_image_model_descriptions,
|
|
157
|
+
register_image,
|
|
158
|
+
unregister_image,
|
|
159
|
+
)
|
|
154
160
|
from ..model.llm import (
|
|
155
161
|
CustomLLMFamilyV1,
|
|
156
162
|
generate_llm_description,
|
|
@@ -166,7 +172,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
166
172
|
unregister_rerank,
|
|
167
173
|
)
|
|
168
174
|
|
|
169
|
-
self._custom_register_type_to_cls: Dict[str, Tuple] = {
|
|
175
|
+
self._custom_register_type_to_cls: Dict[str, Tuple] = { # type: ignore
|
|
170
176
|
"LLM": (
|
|
171
177
|
CustomLLMFamilyV1,
|
|
172
178
|
register_llm,
|
|
@@ -185,6 +191,12 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
185
191
|
unregister_rerank,
|
|
186
192
|
generate_rerank_description,
|
|
187
193
|
),
|
|
194
|
+
"image": (
|
|
195
|
+
CustomImageModelFamilyV1,
|
|
196
|
+
register_image,
|
|
197
|
+
unregister_image,
|
|
198
|
+
generate_image_description,
|
|
199
|
+
),
|
|
188
200
|
"audio": (
|
|
189
201
|
CustomAudioModelFamilyV1,
|
|
190
202
|
register_audio,
|
|
@@ -194,7 +206,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
194
206
|
}
|
|
195
207
|
|
|
196
208
|
# record model version
|
|
197
|
-
model_version_infos: Dict[str, List[Dict]] = {}
|
|
209
|
+
model_version_infos: Dict[str, List[Dict]] = {} # type: ignore
|
|
198
210
|
model_version_infos.update(get_llm_model_descriptions())
|
|
199
211
|
model_version_infos.update(get_embedding_model_descriptions())
|
|
200
212
|
model_version_infos.update(get_rerank_model_descriptions())
|
|
@@ -272,7 +284,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
272
284
|
return {
|
|
273
285
|
"chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
|
|
274
286
|
"generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
|
|
275
|
-
"
|
|
287
|
+
"tools": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
|
|
276
288
|
}
|
|
277
289
|
|
|
278
290
|
async def get_devices_count(self) -> int:
|
|
@@ -486,6 +498,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
486
498
|
return ret
|
|
487
499
|
elif model_type == "image":
|
|
488
500
|
from ..model.image import BUILTIN_IMAGE_MODELS
|
|
501
|
+
from ..model.image.custom import get_user_defined_images
|
|
489
502
|
|
|
490
503
|
ret = []
|
|
491
504
|
for model_name, family in BUILTIN_IMAGE_MODELS.items():
|
|
@@ -494,6 +507,16 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
494
507
|
else:
|
|
495
508
|
ret.append({"model_name": model_name, "is_builtin": True})
|
|
496
509
|
|
|
510
|
+
for model_spec in get_user_defined_images():
|
|
511
|
+
if detailed:
|
|
512
|
+
ret.append(
|
|
513
|
+
await self._to_image_model_reg(model_spec, is_builtin=False)
|
|
514
|
+
)
|
|
515
|
+
else:
|
|
516
|
+
ret.append(
|
|
517
|
+
{"model_name": model_spec.model_name, "is_builtin": False}
|
|
518
|
+
)
|
|
519
|
+
|
|
497
520
|
ret.sort(key=sort_helper)
|
|
498
521
|
return ret
|
|
499
522
|
elif model_type == "audio":
|
|
@@ -567,8 +590,9 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
567
590
|
raise ValueError(f"Model {model_name} not found")
|
|
568
591
|
elif model_type == "image":
|
|
569
592
|
from ..model.image import BUILTIN_IMAGE_MODELS
|
|
593
|
+
from ..model.image.custom import get_user_defined_images
|
|
570
594
|
|
|
571
|
-
for f in BUILTIN_IMAGE_MODELS.values():
|
|
595
|
+
for f in list(BUILTIN_IMAGE_MODELS.values()) + get_user_defined_images():
|
|
572
596
|
if f.model_name == model_name:
|
|
573
597
|
return f
|
|
574
598
|
raise ValueError(f"Model {model_name} not found")
|
|
@@ -591,6 +615,24 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
591
615
|
else:
|
|
592
616
|
raise ValueError(f"Unsupported model type: {model_type}")
|
|
593
617
|
|
|
618
|
+
@log_async(logger=logger)
|
|
619
|
+
async def query_engines_by_model_name(self, model_name: str):
|
|
620
|
+
from copy import deepcopy
|
|
621
|
+
|
|
622
|
+
from ..model.llm.llm_family import LLM_ENGINES
|
|
623
|
+
|
|
624
|
+
if model_name not in LLM_ENGINES:
|
|
625
|
+
raise ValueError(f"Model {model_name} not found")
|
|
626
|
+
|
|
627
|
+
# filter llm_class
|
|
628
|
+
engine_params = deepcopy(LLM_ENGINES[model_name])
|
|
629
|
+
for engine in engine_params:
|
|
630
|
+
params = engine_params[engine]
|
|
631
|
+
for param in params:
|
|
632
|
+
del param["llm_class"]
|
|
633
|
+
|
|
634
|
+
return engine_params
|
|
635
|
+
|
|
594
636
|
@log_async(logger=logger)
|
|
595
637
|
async def register_model(self, model_type: str, model: str, persist: bool):
|
|
596
638
|
if model_type in self._custom_register_type_to_cls:
|
|
@@ -651,6 +693,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
651
693
|
self,
|
|
652
694
|
model_uid: Optional[str],
|
|
653
695
|
model_type: str,
|
|
696
|
+
model_engine: Optional[str],
|
|
654
697
|
model_version: str,
|
|
655
698
|
replica: int = 1,
|
|
656
699
|
n_gpu: Optional[Union[int, str]] = "auto",
|
|
@@ -666,6 +709,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
666
709
|
return await self.launch_builtin_model(
|
|
667
710
|
model_uid=model_uid,
|
|
668
711
|
model_name=parse_results[0],
|
|
712
|
+
model_engine=model_engine,
|
|
669
713
|
model_size_in_billions=parse_results[1] if model_type == "LLM" else None,
|
|
670
714
|
model_format=parse_results[2] if model_type == "LLM" else None,
|
|
671
715
|
quantization=parse_results[3] if model_type == "LLM" else None,
|
|
@@ -677,66 +721,6 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
677
721
|
**kwargs,
|
|
678
722
|
)
|
|
679
723
|
|
|
680
|
-
async def launch_speculative_llm(
|
|
681
|
-
self,
|
|
682
|
-
model_uid: Optional[str],
|
|
683
|
-
model_name: str,
|
|
684
|
-
model_size_in_billions: Optional[Union[int, str]],
|
|
685
|
-
quantization: Optional[str],
|
|
686
|
-
draft_model_name: str,
|
|
687
|
-
draft_model_size_in_billions: Optional[int],
|
|
688
|
-
draft_quantization: Optional[str],
|
|
689
|
-
n_gpu: Optional[Union[int, str]] = "auto",
|
|
690
|
-
) -> str:
|
|
691
|
-
if model_uid is None:
|
|
692
|
-
model_uid = self._gen_model_uid(model_name)
|
|
693
|
-
logger.debug(
|
|
694
|
-
(
|
|
695
|
-
f"Enter launch_speculative_llm, model_uid: %s, model_name: %s, model_size: %s, "
|
|
696
|
-
f"draft_model_name: %s, draft_model_size: %s"
|
|
697
|
-
),
|
|
698
|
-
model_uid,
|
|
699
|
-
model_name,
|
|
700
|
-
str(model_size_in_billions) if model_size_in_billions else "",
|
|
701
|
-
draft_model_name,
|
|
702
|
-
draft_model_size_in_billions,
|
|
703
|
-
)
|
|
704
|
-
|
|
705
|
-
# TODO: the draft and target model must be on the same worker.
|
|
706
|
-
if not self.is_local_deployment():
|
|
707
|
-
raise ValueError(
|
|
708
|
-
"Speculative model is not supported in distributed deployment yet."
|
|
709
|
-
)
|
|
710
|
-
|
|
711
|
-
if model_uid in self._model_uid_to_replica_info:
|
|
712
|
-
raise ValueError(f"Model is already in the model list, uid: {model_uid}")
|
|
713
|
-
|
|
714
|
-
worker_ref = await self._choose_worker()
|
|
715
|
-
replica = 1
|
|
716
|
-
self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
|
|
717
|
-
replica=replica, scheduler=itertools.cycle(range(replica))
|
|
718
|
-
)
|
|
719
|
-
|
|
720
|
-
try:
|
|
721
|
-
rep_model_uid = f"{model_uid}-{1}-{0}"
|
|
722
|
-
await worker_ref.launch_speculative_model(
|
|
723
|
-
model_uid=rep_model_uid,
|
|
724
|
-
model_name=model_name,
|
|
725
|
-
model_size_in_billions=model_size_in_billions,
|
|
726
|
-
quantization=quantization,
|
|
727
|
-
draft_model_name=draft_model_name,
|
|
728
|
-
draft_model_size_in_billions=draft_model_size_in_billions,
|
|
729
|
-
draft_quantization=draft_quantization,
|
|
730
|
-
n_gpu=n_gpu,
|
|
731
|
-
)
|
|
732
|
-
self._replica_model_uid_to_worker[rep_model_uid] = worker_ref
|
|
733
|
-
|
|
734
|
-
except Exception:
|
|
735
|
-
# terminate_model will remove the replica info.
|
|
736
|
-
await self.terminate_model(model_uid, suppress_exception=True)
|
|
737
|
-
raise
|
|
738
|
-
return model_uid
|
|
739
|
-
|
|
740
724
|
async def launch_builtin_model(
|
|
741
725
|
self,
|
|
742
726
|
model_uid: Optional[str],
|
|
@@ -744,6 +728,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
744
728
|
model_size_in_billions: Optional[Union[int, str]],
|
|
745
729
|
model_format: Optional[str],
|
|
746
730
|
quantization: Optional[str],
|
|
731
|
+
model_engine: Optional[str],
|
|
747
732
|
model_type: Optional[str],
|
|
748
733
|
replica: int = 1,
|
|
749
734
|
n_gpu: Optional[Union[int, str]] = "auto",
|
|
@@ -799,6 +784,7 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
799
784
|
model_size_in_billions=model_size_in_billions,
|
|
800
785
|
model_format=model_format,
|
|
801
786
|
quantization=quantization,
|
|
787
|
+
model_engine=model_engine,
|
|
802
788
|
model_type=model_type,
|
|
803
789
|
n_gpu=n_gpu,
|
|
804
790
|
request_limits=request_limits,
|
xinference/core/worker.py
CHANGED
|
@@ -34,7 +34,7 @@ from ..constants import (
|
|
|
34
34
|
)
|
|
35
35
|
from ..core import ModelActor
|
|
36
36
|
from ..core.status_guard import LaunchStatus
|
|
37
|
-
from ..device_utils import gpu_count
|
|
37
|
+
from ..device_utils import get_available_device_env_name, gpu_count
|
|
38
38
|
from ..model.core import ModelDescription, create_model_instance
|
|
39
39
|
from ..types import PeftModelConfig
|
|
40
40
|
from .event import Event, EventCollectorActor, EventType
|
|
@@ -80,7 +80,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
80
80
|
int, Set[Tuple[str, str]]
|
|
81
81
|
] = defaultdict(set)
|
|
82
82
|
self._model_uid_to_addr: Dict[str, str] = {}
|
|
83
|
-
self._model_uid_to_recover_count: Dict[str, int] = {}
|
|
83
|
+
self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
|
|
84
84
|
self._model_uid_to_launch_args: Dict[str, Dict] = {}
|
|
85
85
|
|
|
86
86
|
# metrics export server.
|
|
@@ -137,14 +137,19 @@ class WorkerActor(xo.StatelessActor):
|
|
|
137
137
|
recover_count - 1,
|
|
138
138
|
)
|
|
139
139
|
event_model_uid, _, __ = parse_replica_model_uid(model_uid)
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
140
|
+
try:
|
|
141
|
+
await self._event_collector_ref.report_event(
|
|
142
|
+
event_model_uid,
|
|
143
|
+
Event(
|
|
144
|
+
event_type=EventType.WARNING,
|
|
145
|
+
event_ts=int(time.time()),
|
|
146
|
+
event_content="Recreate model",
|
|
147
|
+
),
|
|
148
|
+
)
|
|
149
|
+
except Exception as e:
|
|
150
|
+
# Report callback error can be log and ignore, should not interrupt the Process
|
|
151
|
+
logger.error("report_event error: %s" % (e))
|
|
152
|
+
|
|
148
153
|
self._model_uid_to_recover_count[model_uid] = (
|
|
149
154
|
recover_count - 1
|
|
150
155
|
)
|
|
@@ -166,22 +171,22 @@ class WorkerActor(xo.StatelessActor):
|
|
|
166
171
|
from .status_guard import StatusGuardActor
|
|
167
172
|
from .supervisor import SupervisorActor
|
|
168
173
|
|
|
169
|
-
self._status_guard_ref: xo.ActorRefType[
|
|
174
|
+
self._status_guard_ref: xo.ActorRefType[ # type: ignore
|
|
170
175
|
"StatusGuardActor"
|
|
171
176
|
] = await xo.actor_ref(
|
|
172
177
|
address=self._supervisor_address, uid=StatusGuardActor.uid()
|
|
173
178
|
)
|
|
174
|
-
self._event_collector_ref: xo.ActorRefType[
|
|
179
|
+
self._event_collector_ref: xo.ActorRefType[ # type: ignore
|
|
175
180
|
EventCollectorActor
|
|
176
181
|
] = await xo.actor_ref(
|
|
177
182
|
address=self._supervisor_address, uid=EventCollectorActor.uid()
|
|
178
183
|
)
|
|
179
|
-
self._cache_tracker_ref: xo.ActorRefType[
|
|
184
|
+
self._cache_tracker_ref: xo.ActorRefType[ # type: ignore
|
|
180
185
|
"CacheTrackerActor"
|
|
181
186
|
] = await xo.actor_ref(
|
|
182
187
|
address=self._supervisor_address, uid=CacheTrackerActor.uid()
|
|
183
188
|
)
|
|
184
|
-
self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
|
|
189
|
+
self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref( # type: ignore
|
|
185
190
|
address=self._supervisor_address, uid=SupervisorActor.uid()
|
|
186
191
|
)
|
|
187
192
|
await self._supervisor_ref.add_worker(self.address)
|
|
@@ -208,7 +213,12 @@ class WorkerActor(xo.StatelessActor):
|
|
|
208
213
|
register_embedding,
|
|
209
214
|
unregister_embedding,
|
|
210
215
|
)
|
|
211
|
-
from ..model.image import
|
|
216
|
+
from ..model.image import (
|
|
217
|
+
CustomImageModelFamilyV1,
|
|
218
|
+
get_image_model_descriptions,
|
|
219
|
+
register_image,
|
|
220
|
+
unregister_image,
|
|
221
|
+
)
|
|
212
222
|
from ..model.llm import (
|
|
213
223
|
CustomLLMFamilyV1,
|
|
214
224
|
get_llm_model_descriptions,
|
|
@@ -222,7 +232,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
222
232
|
unregister_rerank,
|
|
223
233
|
)
|
|
224
234
|
|
|
225
|
-
self._custom_register_type_to_cls: Dict[str, Tuple] = {
|
|
235
|
+
self._custom_register_type_to_cls: Dict[str, Tuple] = { # type: ignore
|
|
226
236
|
"LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
|
|
227
237
|
"embedding": (
|
|
228
238
|
CustomEmbeddingModelSpec,
|
|
@@ -231,10 +241,15 @@ class WorkerActor(xo.StatelessActor):
|
|
|
231
241
|
),
|
|
232
242
|
"rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
|
|
233
243
|
"audio": (CustomAudioModelFamilyV1, register_audio, unregister_audio),
|
|
244
|
+
"image": (
|
|
245
|
+
CustomImageModelFamilyV1,
|
|
246
|
+
register_image,
|
|
247
|
+
unregister_image,
|
|
248
|
+
),
|
|
234
249
|
}
|
|
235
250
|
|
|
236
251
|
# record model version
|
|
237
|
-
model_version_infos: Dict[str, List[Dict]] = {}
|
|
252
|
+
model_version_infos: Dict[str, List[Dict]] = {} # type: ignore
|
|
238
253
|
model_version_infos.update(get_llm_model_descriptions())
|
|
239
254
|
model_version_infos.update(get_embedding_model_descriptions())
|
|
240
255
|
model_version_infos.update(get_rerank_model_descriptions())
|
|
@@ -248,7 +263,11 @@ class WorkerActor(xo.StatelessActor):
|
|
|
248
263
|
if os.name != "nt":
|
|
249
264
|
|
|
250
265
|
async def signal_handler():
|
|
251
|
-
|
|
266
|
+
try:
|
|
267
|
+
await self._supervisor_ref.remove_worker(self.address)
|
|
268
|
+
except Exception as e:
|
|
269
|
+
# Ignore the error of rpc, anyway we are exiting
|
|
270
|
+
logger.exception("remove worker rpc error: %s", e)
|
|
252
271
|
os._exit(0)
|
|
253
272
|
|
|
254
273
|
loop = asyncio.get_running_loop()
|
|
@@ -437,6 +456,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
437
456
|
) -> Tuple[str, List[str]]:
|
|
438
457
|
env = {}
|
|
439
458
|
devices = []
|
|
459
|
+
env_name = get_available_device_env_name()
|
|
440
460
|
if gpu_idx is None:
|
|
441
461
|
if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
|
|
442
462
|
# Currently, n_gpu=auto means using 1 GPU
|
|
@@ -446,17 +466,17 @@ class WorkerActor(xo.StatelessActor):
|
|
|
446
466
|
if model_type in ["embedding", "rerank"]
|
|
447
467
|
else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
|
|
448
468
|
)
|
|
449
|
-
env[
|
|
469
|
+
env[env_name] = ",".join([str(dev) for dev in devices])
|
|
450
470
|
logger.debug(f"GPU selected: {devices} for model {model_uid}")
|
|
451
471
|
if n_gpu is None:
|
|
452
|
-
env[
|
|
472
|
+
env[env_name] = "-1"
|
|
453
473
|
logger.debug(f"GPU disabled for model {model_uid}")
|
|
454
474
|
else:
|
|
455
475
|
assert isinstance(gpu_idx, list)
|
|
456
476
|
devices = await self.allocate_devices_with_gpu_idx(
|
|
457
477
|
model_uid, model_type, gpu_idx # type: ignore
|
|
458
478
|
)
|
|
459
|
-
env[
|
|
479
|
+
env[env_name] = ",".join([str(dev) for dev in devices])
|
|
460
480
|
|
|
461
481
|
if os.name != "nt" and platform.system() != "Darwin":
|
|
462
482
|
# Linux
|
|
@@ -503,67 +523,6 @@ class WorkerActor(xo.StatelessActor):
|
|
|
503
523
|
else:
|
|
504
524
|
raise ValueError(f"Unsupported model type: {model_type}")
|
|
505
525
|
|
|
506
|
-
@log_async(logger=logger)
|
|
507
|
-
async def launch_speculative_model(
|
|
508
|
-
self,
|
|
509
|
-
model_uid: str,
|
|
510
|
-
model_name: str,
|
|
511
|
-
model_size_in_billions: Optional[int],
|
|
512
|
-
quantization: Optional[str],
|
|
513
|
-
draft_model_name: str,
|
|
514
|
-
draft_model_size_in_billions: Optional[int],
|
|
515
|
-
draft_quantization: Optional[str],
|
|
516
|
-
n_gpu: Optional[Union[int, str]] = "auto",
|
|
517
|
-
):
|
|
518
|
-
if n_gpu is not None:
|
|
519
|
-
if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
|
|
520
|
-
raise ValueError(
|
|
521
|
-
f"The parameter `n_gpu` must be greater than 0 and "
|
|
522
|
-
f"not greater than the number of GPUs: {gpu_count()} on the machine."
|
|
523
|
-
)
|
|
524
|
-
if isinstance(n_gpu, str) and n_gpu != "auto":
|
|
525
|
-
raise ValueError("Currently `n_gpu` only supports `auto`.")
|
|
526
|
-
|
|
527
|
-
from ..model.llm.core import create_speculative_llm_model_instance
|
|
528
|
-
|
|
529
|
-
subpool_address, devices = await self._create_subpool(model_uid, n_gpu=n_gpu)
|
|
530
|
-
|
|
531
|
-
model, model_description = await asyncio.to_thread(
|
|
532
|
-
create_speculative_llm_model_instance,
|
|
533
|
-
subpool_addr=subpool_address,
|
|
534
|
-
devices=devices,
|
|
535
|
-
model_uid=model_uid,
|
|
536
|
-
model_name=model_name,
|
|
537
|
-
model_size_in_billions=model_size_in_billions,
|
|
538
|
-
quantization=quantization,
|
|
539
|
-
draft_model_name=draft_model_name,
|
|
540
|
-
draft_model_size_in_billions=draft_model_size_in_billions,
|
|
541
|
-
draft_quantization=draft_quantization,
|
|
542
|
-
is_local_deployment=True,
|
|
543
|
-
)
|
|
544
|
-
|
|
545
|
-
try:
|
|
546
|
-
model_ref = await xo.create_actor(
|
|
547
|
-
ModelActor,
|
|
548
|
-
address=subpool_address,
|
|
549
|
-
uid=model_uid,
|
|
550
|
-
worker_address=self.address,
|
|
551
|
-
model=model,
|
|
552
|
-
model_description=model_description,
|
|
553
|
-
)
|
|
554
|
-
await model_ref.load()
|
|
555
|
-
except:
|
|
556
|
-
logger.error(f"Failed to load model {model_uid}", exc_info=True)
|
|
557
|
-
self.release_devices(model_uid=model_uid)
|
|
558
|
-
await self._main_pool.remove_sub_pool(subpool_address)
|
|
559
|
-
raise
|
|
560
|
-
|
|
561
|
-
self._model_uid_to_model[model_uid] = model_ref
|
|
562
|
-
self._model_uid_to_model_spec[model_uid] = model_description
|
|
563
|
-
for dev in devices:
|
|
564
|
-
self._gpu_to_model_uid[int(dev)] = model_uid
|
|
565
|
-
self._model_uid_to_addr[model_uid] = subpool_address
|
|
566
|
-
|
|
567
526
|
async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
|
|
568
527
|
from ..model.llm.core import LLM
|
|
569
528
|
|
|
@@ -605,6 +564,7 @@ class WorkerActor(xo.StatelessActor):
|
|
|
605
564
|
model_size_in_billions: Optional[Union[int, str]],
|
|
606
565
|
model_format: Optional[str],
|
|
607
566
|
quantization: Optional[str],
|
|
567
|
+
model_engine: Optional[str],
|
|
608
568
|
model_type: str = "LLM",
|
|
609
569
|
n_gpu: Optional[Union[int, str]] = "auto",
|
|
610
570
|
peft_model_config: Optional[PeftModelConfig] = None,
|
|
@@ -612,20 +572,28 @@ class WorkerActor(xo.StatelessActor):
|
|
|
612
572
|
gpu_idx: Optional[Union[int, List[int]]] = None,
|
|
613
573
|
**kwargs,
|
|
614
574
|
):
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
Event(
|
|
619
|
-
event_type=EventType.INFO,
|
|
620
|
-
event_ts=int(time.time()),
|
|
621
|
-
event_content="Launch model",
|
|
622
|
-
),
|
|
623
|
-
)
|
|
575
|
+
# !!! Note that The following code must be placed at the very beginning of this function,
|
|
576
|
+
# or there will be problems with auto-recovery.
|
|
577
|
+
# Because `locals()` will collect all the local parameters of this function and pass to this function again.
|
|
624
578
|
launch_args = locals()
|
|
625
579
|
launch_args.pop("self")
|
|
626
580
|
launch_args.pop("kwargs")
|
|
627
581
|
launch_args.update(kwargs)
|
|
628
582
|
|
|
583
|
+
event_model_uid, _, __ = parse_replica_model_uid(model_uid)
|
|
584
|
+
try:
|
|
585
|
+
await self._event_collector_ref.report_event(
|
|
586
|
+
event_model_uid,
|
|
587
|
+
Event(
|
|
588
|
+
event_type=EventType.INFO,
|
|
589
|
+
event_ts=int(time.time()),
|
|
590
|
+
event_content="Launch model",
|
|
591
|
+
),
|
|
592
|
+
)
|
|
593
|
+
except Exception as e:
|
|
594
|
+
# Report callback error can be log and ignore, should not interrupt the Process
|
|
595
|
+
logger.error("report_event error: %s" % (e))
|
|
596
|
+
|
|
629
597
|
if gpu_idx is not None:
|
|
630
598
|
logger.info(
|
|
631
599
|
f"You specify to launch the model: {model_name} on GPU index: {gpu_idx} "
|
|
@@ -657,8 +625,6 @@ class WorkerActor(xo.StatelessActor):
|
|
|
657
625
|
|
|
658
626
|
assert model_uid not in self._model_uid_to_model
|
|
659
627
|
self._check_model_is_valid(model_name, model_format)
|
|
660
|
-
assert self._supervisor_ref is not None
|
|
661
|
-
is_local_deployment = await self._supervisor_ref.is_local_deployment()
|
|
662
628
|
|
|
663
629
|
subpool_address, devices = await self._create_subpool(
|
|
664
630
|
model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
|
|
@@ -673,11 +639,11 @@ class WorkerActor(xo.StatelessActor):
|
|
|
673
639
|
model_uid,
|
|
674
640
|
model_type,
|
|
675
641
|
model_name,
|
|
642
|
+
model_engine,
|
|
676
643
|
model_format,
|
|
677
644
|
model_size_in_billions,
|
|
678
645
|
quantization,
|
|
679
646
|
peft_model_config,
|
|
680
|
-
is_local_deployment,
|
|
681
647
|
**kwargs,
|
|
682
648
|
)
|
|
683
649
|
await self.update_cache_status(model_name, model_description)
|
|
@@ -715,14 +681,19 @@ class WorkerActor(xo.StatelessActor):
|
|
|
715
681
|
@log_async(logger=logger)
|
|
716
682
|
async def terminate_model(self, model_uid: str):
|
|
717
683
|
event_model_uid, _, __ = parse_replica_model_uid(model_uid)
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
684
|
+
try:
|
|
685
|
+
await self._event_collector_ref.report_event(
|
|
686
|
+
event_model_uid,
|
|
687
|
+
Event(
|
|
688
|
+
event_type=EventType.INFO,
|
|
689
|
+
event_ts=int(time.time()),
|
|
690
|
+
event_content="Terminate model",
|
|
691
|
+
),
|
|
692
|
+
)
|
|
693
|
+
except Exception as e:
|
|
694
|
+
# Report callback error can be log and ignore, should not interrupt the Process
|
|
695
|
+
logger.error("report_event error: %s" % (e))
|
|
696
|
+
|
|
726
697
|
origin_uid, _, _ = parse_replica_model_uid(model_uid)
|
|
727
698
|
await self._status_guard_ref.update_instance_info(
|
|
728
699
|
origin_uid, {"status": LaunchStatus.TERMINATING.name}
|