xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.
Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/core/status_guard.py CHANGED
@@ -48,7 +48,7 @@ class InstanceInfo(BaseModel):
 class StatusGuardActor(xo.StatelessActor):
     def __init__(self):
         super().__init__()
-        self._model_uid_to_info: Dict[str, InstanceInfo] = {}
+        self._model_uid_to_info: Dict[str, InstanceInfo] = {}  # type: ignore

     @classmethod
     def uid(cls) -> str:
xinference/core/supervisor.py CHANGED
@@ -80,12 +80,12 @@ class ReplicaInfo:
 class SupervisorActor(xo.StatelessActor):
     def __init__(self):
         super().__init__()
-        self._worker_address_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = {}
-        self._worker_status: Dict[str, WorkerStatus] = {}
-        self._replica_model_uid_to_worker: Dict[
+        self._worker_address_to_worker: Dict[str, xo.ActorRefType["WorkerActor"]] = {}  # type: ignore
+        self._worker_status: Dict[str, WorkerStatus] = {}  # type: ignore
+        self._replica_model_uid_to_worker: Dict[  # type: ignore
             str, xo.ActorRefType["WorkerActor"]
         ] = {}
-        self._model_uid_to_replica_info: Dict[str, ReplicaInfo] = {}
+        self._model_uid_to_replica_info: Dict[str, ReplicaInfo] = {}  # type: ignore
         self._uptime = None
         self._lock = asyncio.Lock()

@@ -117,12 +117,12 @@ class SupervisorActor(xo.StatelessActor):
         from .cache_tracker import CacheTrackerActor
         from .status_guard import StatusGuardActor

-        self._status_guard_ref: xo.ActorRefType[
+        self._status_guard_ref: xo.ActorRefType[  # type: ignore
             "StatusGuardActor"
         ] = await xo.create_actor(
             StatusGuardActor, address=self.address, uid=StatusGuardActor.uid()
         )
-        self._cache_tracker_ref: xo.ActorRefType[
+        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
             "CacheTrackerActor"
         ] = await xo.create_actor(
             CacheTrackerActor, address=self.address, uid=CacheTrackerActor.uid()
@@ -130,7 +130,7 @@ class SupervisorActor(xo.StatelessActor):

         from .event import EventCollectorActor

-        self._event_collector_ref: xo.ActorRefType[
+        self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = await xo.create_actor(
             EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
@@ -150,7 +150,13 @@ class SupervisorActor(xo.StatelessActor):
             register_embedding,
             unregister_embedding,
         )
-        from ..model.image import get_image_model_descriptions
+        from ..model.image import (
+            CustomImageModelFamilyV1,
+            generate_image_description,
+            get_image_model_descriptions,
+            register_image,
+            unregister_image,
+        )
         from ..model.llm import (
             CustomLLMFamilyV1,
             generate_llm_description,
@@ -166,7 +172,7 @@ class SupervisorActor(xo.StatelessActor):
             unregister_rerank,
         )

-        self._custom_register_type_to_cls: Dict[str, Tuple] = {
+        self._custom_register_type_to_cls: Dict[str, Tuple] = {  # type: ignore
             "LLM": (
                 CustomLLMFamilyV1,
                 register_llm,
@@ -185,6 +191,12 @@ class SupervisorActor(xo.StatelessActor):
                 unregister_rerank,
                 generate_rerank_description,
             ),
+            "image": (
+                CustomImageModelFamilyV1,
+                register_image,
+                unregister_image,
+                generate_image_description,
+            ),
             "audio": (
                 CustomAudioModelFamilyV1,
                 register_audio,
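
With custom image model registration wired into the supervisor (above) and the worker (below), user-defined image models can be registered the same way as custom LLMs. A minimal sketch, assuming the RESTful client's generic register_model API and illustrative CustomImageModelFamilyV1 field names:

    import json
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    # Hypothetical spec for a locally stored Stable Diffusion variant;
    # the exact fields are defined by CustomImageModelFamilyV1.
    spec = json.dumps(
        {
            "model_name": "my-sd-model",
            "model_family": "stable_diffusion",
            "model_uri": "/path/to/my-sd-model",
        }
    )
    client.register_model(model_type="image", model=spec, persist=True)
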
@@ -194,7 +206,7 @@ class SupervisorActor(xo.StatelessActor):
         }

         # record model version
-        model_version_infos: Dict[str, List[Dict]] = {}
+        model_version_infos: Dict[str, List[Dict]] = {}  # type: ignore
         model_version_infos.update(get_llm_model_descriptions())
         model_version_infos.update(get_embedding_model_descriptions())
         model_version_infos.update(get_rerank_model_descriptions())
@@ -272,7 +284,7 @@ class SupervisorActor(xo.StatelessActor):
         return {
             "chat": list(BUILTIN_LLM_MODEL_CHAT_FAMILIES),
             "generate": list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES),
-            "tool_call": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
+            "tools": list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES),
         }

     async def get_devices_count(self) -> int:
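
Note the renamed ability key: the builtin-families response now reports tool-call-capable families under "tools" instead of "tool_call", a small breaking change for clients that index the response by the old key.
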
@@ -486,6 +498,7 @@ class SupervisorActor(xo.StatelessActor):
             return ret
         elif model_type == "image":
             from ..model.image import BUILTIN_IMAGE_MODELS
+            from ..model.image.custom import get_user_defined_images

             ret = []
             for model_name, family in BUILTIN_IMAGE_MODELS.items():
@@ -494,6 +507,16 @@ class SupervisorActor(xo.StatelessActor):
                 else:
                     ret.append({"model_name": model_name, "is_builtin": True})

+            for model_spec in get_user_defined_images():
+                if detailed:
+                    ret.append(
+                        await self._to_image_model_reg(model_spec, is_builtin=False)
+                    )
+                else:
+                    ret.append(
+                        {"model_name": model_spec.model_name, "is_builtin": False}
+                    )
+
             ret.sort(key=sort_helper)
             return ret
         elif model_type == "audio":
@@ -567,8 +590,9 @@ class SupervisorActor(xo.StatelessActor):
             raise ValueError(f"Model {model_name} not found")
         elif model_type == "image":
             from ..model.image import BUILTIN_IMAGE_MODELS
+            from ..model.image.custom import get_user_defined_images

-            for f in BUILTIN_IMAGE_MODELS.values():
+            for f in list(BUILTIN_IMAGE_MODELS.values()) + get_user_defined_images():
                 if f.model_name == model_name:
                     return f
             raise ValueError(f"Model {model_name} not found")
@@ -591,6 +615,24 @@ class SupervisorActor(xo.StatelessActor):
         else:
             raise ValueError(f"Unsupported model type: {model_type}")

+    @log_async(logger=logger)
+    async def query_engines_by_model_name(self, model_name: str):
+        from copy import deepcopy
+
+        from ..model.llm.llm_family import LLM_ENGINES
+
+        if model_name not in LLM_ENGINES:
+            raise ValueError(f"Model {model_name} not found")
+
+        # filter llm_class
+        engine_params = deepcopy(LLM_ENGINES[model_name])
+        for engine in engine_params:
+            params = engine_params[engine]
+            for param in params:
+                del param["llm_class"]
+
+        return engine_params
+
     @log_async(logger=logger)
     async def register_model(self, model_type: str, model: str, persist: bool):
         if model_type in self._custom_register_type_to_cls:
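
The new query_engines_by_model_name backs engine discovery for the model_engine parameter introduced below. A minimal client-side sketch, assuming the REST route /v1/engines/{model_name} added in restful_api.py in this release:

    import requests

    # Lists the engines (e.g. vllm, llama.cpp, transformers) able to serve
    # the model, with the formats and quantizations each engine supports.
    resp = requests.get("http://127.0.0.1:9997/v1/engines/llama-2-chat")
    resp.raise_for_status()
    for engine, params in resp.json().items():
        print(engine, params)
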
@@ -651,6 +693,7 @@ class SupervisorActor(xo.StatelessActor):
         self,
         model_uid: Optional[str],
         model_type: str,
+        model_engine: Optional[str],
         model_version: str,
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
@@ -666,6 +709,7 @@ class SupervisorActor(xo.StatelessActor):
         return await self.launch_builtin_model(
             model_uid=model_uid,
             model_name=parse_results[0],
+            model_engine=model_engine,
             model_size_in_billions=parse_results[1] if model_type == "LLM" else None,
             model_format=parse_results[2] if model_type == "LLM" else None,
             quantization=parse_results[3] if model_type == "LLM" else None,
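
Since model_engine is threaded through every launch path, launching an LLM now names its serving backend explicitly. A minimal sketch (model name and engine value are illustrative):

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    # model_engine selects the backend; valid values for a given model can
    # be discovered via the engines query shown above.
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_engine="vllm",
        model_format="pytorch",
        model_size_in_billions=7,
    )
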
@@ -677,66 +721,6 @@ class SupervisorActor(xo.StatelessActor):
             **kwargs,
         )

-    async def launch_speculative_llm(
-        self,
-        model_uid: Optional[str],
-        model_name: str,
-        model_size_in_billions: Optional[Union[int, str]],
-        quantization: Optional[str],
-        draft_model_name: str,
-        draft_model_size_in_billions: Optional[int],
-        draft_quantization: Optional[str],
-        n_gpu: Optional[Union[int, str]] = "auto",
-    ) -> str:
-        if model_uid is None:
-            model_uid = self._gen_model_uid(model_name)
-        logger.debug(
-            (
-                f"Enter launch_speculative_llm, model_uid: %s, model_name: %s, model_size: %s, "
-                f"draft_model_name: %s, draft_model_size: %s"
-            ),
-            model_uid,
-            model_name,
-            str(model_size_in_billions) if model_size_in_billions else "",
-            draft_model_name,
-            draft_model_size_in_billions,
-        )
-
-        # TODO: the draft and target model must be on the same worker.
-        if not self.is_local_deployment():
-            raise ValueError(
-                "Speculative model is not supported in distributed deployment yet."
-            )
-
-        if model_uid in self._model_uid_to_replica_info:
-            raise ValueError(f"Model is already in the model list, uid: {model_uid}")
-
-        worker_ref = await self._choose_worker()
-        replica = 1
-        self._model_uid_to_replica_info[model_uid] = ReplicaInfo(
-            replica=replica, scheduler=itertools.cycle(range(replica))
-        )
-
-        try:
-            rep_model_uid = f"{model_uid}-{1}-{0}"
-            await worker_ref.launch_speculative_model(
-                model_uid=rep_model_uid,
-                model_name=model_name,
-                model_size_in_billions=model_size_in_billions,
-                quantization=quantization,
-                draft_model_name=draft_model_name,
-                draft_model_size_in_billions=draft_model_size_in_billions,
-                draft_quantization=draft_quantization,
-                n_gpu=n_gpu,
-            )
-            self._replica_model_uid_to_worker[rep_model_uid] = worker_ref
-
-        except Exception:
-            # terminate_model will remove the replica info.
-            await self.terminate_model(model_uid, suppress_exception=True)
-            raise
-        return model_uid
-
     async def launch_builtin_model(
         self,
         model_uid: Optional[str],
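
Together with the deleted spec_decoding_utils.py and spec_model.py (files 87 and 88 in the list above), removing launch_speculative_llm here and launch_speculative_model in the worker drops speculative decoding support from this release.
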
@@ -744,6 +728,7 @@ class SupervisorActor(xo.StatelessActor):
         model_size_in_billions: Optional[Union[int, str]],
         model_format: Optional[str],
         quantization: Optional[str],
+        model_engine: Optional[str],
         model_type: Optional[str],
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
@@ -799,6 +784,7 @@ class SupervisorActor(xo.StatelessActor):
             model_size_in_billions=model_size_in_billions,
             model_format=model_format,
             quantization=quantization,
+            model_engine=model_engine,
             model_type=model_type,
             n_gpu=n_gpu,
             request_limits=request_limits,
xinference/core/worker.py CHANGED
@@ -34,7 +34,7 @@ from ..constants import (
 )
 from ..core import ModelActor
 from ..core.status_guard import LaunchStatus
-from ..device_utils import gpu_count
+from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, create_model_instance
 from ..types import PeftModelConfig
 from .event import Event, EventCollectorActor, EventType
@@ -80,7 +80,7 @@ class WorkerActor(xo.StatelessActor):
             int, Set[Tuple[str, str]]
         ] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
-        self._model_uid_to_recover_count: Dict[str, int] = {}
+        self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}

         # metrics export server.
@@ -137,14 +137,19 @@ class WorkerActor(xo.StatelessActor):
                         recover_count - 1,
                     )
                     event_model_uid, _, __ = parse_replica_model_uid(model_uid)
-                    await self._event_collector_ref.report_event(
-                        event_model_uid,
-                        Event(
-                            event_type=EventType.WARNING,
-                            event_ts=int(time.time()),
-                            event_content="Recreate model",
-                        ),
-                    )
+                    try:
+                        await self._event_collector_ref.report_event(
+                            event_model_uid,
+                            Event(
+                                event_type=EventType.WARNING,
+                                event_ts=int(time.time()),
+                                event_content="Recreate model",
+                            ),
+                        )
+                    except Exception as e:
+                        # Report callback error can be log and ignore, should not interrupt the Process
+                        logger.error("report_event error: %s" % (e))
+
                     self._model_uid_to_recover_count[model_uid] = (
                         recover_count - 1
                     )
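
The same best-effort pattern recurs in the launch and terminate paths below: report_event is wrapped in try/except, so an unreachable event collector can no longer abort launching, terminating, or recovering a model.
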
@@ -166,22 +171,22 @@ class WorkerActor(xo.StatelessActor):
         from .status_guard import StatusGuardActor
         from .supervisor import SupervisorActor

-        self._status_guard_ref: xo.ActorRefType[
+        self._status_guard_ref: xo.ActorRefType[  # type: ignore
             "StatusGuardActor"
         ] = await xo.actor_ref(
             address=self._supervisor_address, uid=StatusGuardActor.uid()
         )
-        self._event_collector_ref: xo.ActorRefType[
+        self._event_collector_ref: xo.ActorRefType[  # type: ignore
             EventCollectorActor
         ] = await xo.actor_ref(
             address=self._supervisor_address, uid=EventCollectorActor.uid()
         )
-        self._cache_tracker_ref: xo.ActorRefType[
+        self._cache_tracker_ref: xo.ActorRefType[  # type: ignore
             "CacheTrackerActor"
         ] = await xo.actor_ref(
             address=self._supervisor_address, uid=CacheTrackerActor.uid()
         )
-        self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(
+        self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = await xo.actor_ref(  # type: ignore
             address=self._supervisor_address, uid=SupervisorActor.uid()
         )
         await self._supervisor_ref.add_worker(self.address)
@@ -208,7 +213,12 @@ class WorkerActor(xo.StatelessActor):
             register_embedding,
             unregister_embedding,
         )
-        from ..model.image import get_image_model_descriptions
+        from ..model.image import (
+            CustomImageModelFamilyV1,
+            get_image_model_descriptions,
+            register_image,
+            unregister_image,
+        )
         from ..model.llm import (
             CustomLLMFamilyV1,
             get_llm_model_descriptions,
@@ -222,7 +232,7 @@ class WorkerActor(xo.StatelessActor):
             unregister_rerank,
         )

-        self._custom_register_type_to_cls: Dict[str, Tuple] = {
+        self._custom_register_type_to_cls: Dict[str, Tuple] = {  # type: ignore
             "LLM": (CustomLLMFamilyV1, register_llm, unregister_llm),
             "embedding": (
                 CustomEmbeddingModelSpec,
@@ -231,10 +241,15 @@ class WorkerActor(xo.StatelessActor):
             ),
             "rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
             "audio": (CustomAudioModelFamilyV1, register_audio, unregister_audio),
+            "image": (
+                CustomImageModelFamilyV1,
+                register_image,
+                unregister_image,
+            ),
         }

         # record model version
-        model_version_infos: Dict[str, List[Dict]] = {}
+        model_version_infos: Dict[str, List[Dict]] = {}  # type: ignore
         model_version_infos.update(get_llm_model_descriptions())
         model_version_infos.update(get_embedding_model_descriptions())
         model_version_infos.update(get_rerank_model_descriptions())
@@ -248,7 +263,11 @@ class WorkerActor(xo.StatelessActor):
         if os.name != "nt":

             async def signal_handler():
-                await self._supervisor_ref.remove_worker(self.address)
+                try:
+                    await self._supervisor_ref.remove_worker(self.address)
+                except Exception as e:
+                    # Ignore the error of rpc, anyway we are exiting
+                    logger.exception("remove worker rpc error: %s", e)
                 os._exit(0)

             loop = asyncio.get_running_loop()
@@ -437,6 +456,7 @@ class WorkerActor(xo.StatelessActor):
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
+        env_name = get_available_device_env_name()
         if gpu_idx is None:
             if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
                 # Currently, n_gpu=auto means using 1 GPU
@@ -446,17 +466,17 @@ class WorkerActor(xo.StatelessActor):
                     if model_type in ["embedding", "rerank"]
                     else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
                 )
-                env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+                env[env_name] = ",".join([str(dev) for dev in devices])
                 logger.debug(f"GPU selected: {devices} for model {model_uid}")
             if n_gpu is None:
-                env["CUDA_VISIBLE_DEVICES"] = "-1"
+                env[env_name] = "-1"
                 logger.debug(f"GPU disabled for model {model_uid}")
         else:
             assert isinstance(gpu_idx, list)
             devices = await self.allocate_devices_with_gpu_idx(
                 model_uid, model_type, gpu_idx  # type: ignore
             )
-            env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+            env[env_name] = ",".join([str(dev) for dev in devices])

         if os.name != "nt" and platform.system() != "Darwin":
             # Linux
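
get_available_device_env_name, added in device_utils.py (file 16 above), lets the worker set the visibility variable for whatever accelerator is present instead of hard-coding CUDA_VISIBLE_DEVICES. A plausible sketch of the idea, not the actual implementation:

    import torch

    # Assumption: choose the device-visibility environment variable matching
    # the detected accelerator; the real table in device_utils.py may differ
    # (e.g. Ascend NPUs use ASCEND_RT_VISIBLE_DEVICES).
    def get_available_device_env_name() -> str:
        if torch.cuda.is_available():
            return "CUDA_VISIBLE_DEVICES"
        # other accelerator backends would return their own variable here
        return "CUDA_VISIBLE_DEVICES"  # fallback when none is detected
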
@@ -503,67 +523,6 @@ class WorkerActor(xo.StatelessActor):
         else:
             raise ValueError(f"Unsupported model type: {model_type}")

-    @log_async(logger=logger)
-    async def launch_speculative_model(
-        self,
-        model_uid: str,
-        model_name: str,
-        model_size_in_billions: Optional[int],
-        quantization: Optional[str],
-        draft_model_name: str,
-        draft_model_size_in_billions: Optional[int],
-        draft_quantization: Optional[str],
-        n_gpu: Optional[Union[int, str]] = "auto",
-    ):
-        if n_gpu is not None:
-            if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
-                raise ValueError(
-                    f"The parameter `n_gpu` must be greater than 0 and "
-                    f"not greater than the number of GPUs: {gpu_count()} on the machine."
-                )
-            if isinstance(n_gpu, str) and n_gpu != "auto":
-                raise ValueError("Currently `n_gpu` only supports `auto`.")
-
-        from ..model.llm.core import create_speculative_llm_model_instance
-
-        subpool_address, devices = await self._create_subpool(model_uid, n_gpu=n_gpu)
-
-        model, model_description = await asyncio.to_thread(
-            create_speculative_llm_model_instance,
-            subpool_addr=subpool_address,
-            devices=devices,
-            model_uid=model_uid,
-            model_name=model_name,
-            model_size_in_billions=model_size_in_billions,
-            quantization=quantization,
-            draft_model_name=draft_model_name,
-            draft_model_size_in_billions=draft_model_size_in_billions,
-            draft_quantization=draft_quantization,
-            is_local_deployment=True,
-        )
-
-        try:
-            model_ref = await xo.create_actor(
-                ModelActor,
-                address=subpool_address,
-                uid=model_uid,
-                worker_address=self.address,
-                model=model,
-                model_description=model_description,
-            )
-            await model_ref.load()
-        except:
-            logger.error(f"Failed to load model {model_uid}", exc_info=True)
-            self.release_devices(model_uid=model_uid)
-            await self._main_pool.remove_sub_pool(subpool_address)
-            raise
-
-        self._model_uid_to_model[model_uid] = model_ref
-        self._model_uid_to_model_spec[model_uid] = model_description
-        for dev in devices:
-            self._gpu_to_model_uid[int(dev)] = model_uid
-        self._model_uid_to_addr[model_uid] = subpool_address
-
     async def _get_model_ability(self, model: Any, model_type: str) -> List[str]:
         from ..model.llm.core import LLM

@@ -605,6 +564,7 @@ class WorkerActor(xo.StatelessActor):
         model_size_in_billions: Optional[Union[int, str]],
         model_format: Optional[str],
         quantization: Optional[str],
+        model_engine: Optional[str],
         model_type: str = "LLM",
         n_gpu: Optional[Union[int, str]] = "auto",
         peft_model_config: Optional[PeftModelConfig] = None,
@@ -621,14 +581,18 @@ class WorkerActor(xo.StatelessActor):
         launch_args.update(kwargs)

         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
-        await self._event_collector_ref.report_event(
-            event_model_uid,
-            Event(
-                event_type=EventType.INFO,
-                event_ts=int(time.time()),
-                event_content="Launch model",
-            ),
-        )
+        try:
+            await self._event_collector_ref.report_event(
+                event_model_uid,
+                Event(
+                    event_type=EventType.INFO,
+                    event_ts=int(time.time()),
+                    event_content="Launch model",
+                ),
+            )
+        except Exception as e:
+            # Report callback error can be log and ignore, should not interrupt the Process
+            logger.error("report_event error: %s" % (e))

         if gpu_idx is not None:
             logger.info(
@@ -661,8 +625,6 @@ class WorkerActor(xo.StatelessActor):

         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)
-        assert self._supervisor_ref is not None
-        is_local_deployment = await self._supervisor_ref.is_local_deployment()

         subpool_address, devices = await self._create_subpool(
             model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
@@ -677,11 +639,11 @@ class WorkerActor(xo.StatelessActor):
             model_uid,
             model_type,
             model_name,
+            model_engine,
             model_format,
             model_size_in_billions,
             quantization,
             peft_model_config,
-            is_local_deployment,
             **kwargs,
         )
         await self.update_cache_status(model_name, model_description)
@@ -719,14 +681,19 @@ class WorkerActor(xo.StatelessActor):
     @log_async(logger=logger)
     async def terminate_model(self, model_uid: str):
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
-        await self._event_collector_ref.report_event(
-            event_model_uid,
-            Event(
-                event_type=EventType.INFO,
-                event_ts=int(time.time()),
-                event_content="Terminate model",
-            ),
-        )
+        try:
+            await self._event_collector_ref.report_event(
+                event_model_uid,
+                Event(
+                    event_type=EventType.INFO,
+                    event_ts=int(time.time()),
+                    event_content="Terminate model",
+                ),
+            )
+        except Exception as e:
+            # Report callback error can be log and ignore, should not interrupt the Process
+            logger.error("report_event error: %s" % (e))
+
         origin_uid, _, _ = parse_replica_model_uid(model_uid)
         await self._status_guard_ref.update_instance_info(
             origin_uid, {"status": LaunchStatus.TERMINATING.name}