xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (120)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/__init__.py +13 -0
  3. xinference/api/oauth2/common.py +14 -0
  4. xinference/api/oauth2/core.py +93 -0
  5. xinference/api/oauth2/types.py +36 -0
  6. xinference/api/oauth2/utils.py +44 -0
  7. xinference/api/restful_api.py +216 -27
  8. xinference/client/oscar/actor_client.py +18 -18
  9. xinference/client/restful/restful_client.py +96 -33
  10. xinference/conftest.py +63 -1
  11. xinference/constants.py +1 -0
  12. xinference/core/chat_interface.py +143 -3
  13. xinference/core/metrics.py +83 -0
  14. xinference/core/model.py +244 -181
  15. xinference/core/status_guard.py +86 -0
  16. xinference/core/supervisor.py +57 -7
  17. xinference/core/worker.py +134 -13
  18. xinference/deploy/cmdline.py +142 -16
  19. xinference/deploy/local.py +39 -7
  20. xinference/deploy/supervisor.py +2 -0
  21. xinference/deploy/worker.py +33 -5
  22. xinference/fields.py +4 -1
  23. xinference/model/core.py +8 -1
  24. xinference/model/embedding/core.py +3 -2
  25. xinference/model/embedding/model_spec_modelscope.json +60 -18
  26. xinference/model/image/stable_diffusion/core.py +4 -3
  27. xinference/model/llm/__init__.py +7 -0
  28. xinference/model/llm/ggml/llamacpp.py +3 -2
  29. xinference/model/llm/llm_family.json +87 -3
  30. xinference/model/llm/llm_family.py +15 -5
  31. xinference/model/llm/llm_family_modelscope.json +92 -3
  32. xinference/model/llm/pytorch/chatglm.py +70 -28
  33. xinference/model/llm/pytorch/core.py +11 -30
  34. xinference/model/llm/pytorch/internlm2.py +155 -0
  35. xinference/model/llm/pytorch/utils.py +0 -153
  36. xinference/model/llm/utils.py +37 -8
  37. xinference/model/llm/vllm/core.py +15 -3
  38. xinference/model/multimodal/__init__.py +15 -8
  39. xinference/model/multimodal/core.py +8 -1
  40. xinference/model/multimodal/model_spec.json +9 -0
  41. xinference/model/multimodal/model_spec_modelscope.json +45 -0
  42. xinference/model/multimodal/qwen_vl.py +5 -9
  43. xinference/model/utils.py +7 -2
  44. xinference/types.py +2 -0
  45. xinference/web/ui/build/asset-manifest.json +3 -3
  46. xinference/web/ui/build/index.html +1 -1
  47. xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
  48. xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
  49. xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
  88. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
  89. xinference/web/ui/node_modules/.package-lock.json +36 -0
  90. xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
  91. xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
  92. xinference/web/ui/node_modules/react-cookie/package.json +55 -0
  93. xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
  94. xinference/web/ui/package-lock.json +37 -0
  95. xinference/web/ui/package.json +3 -2
  96. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
  97. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
  98. xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
  99. xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
  101. xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
  102. xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
  103. xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
  104. xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
  105. xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
  106. xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
  107. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
  108. xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
  109. xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
  110. xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
  111. xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
  112. xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
  117. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
  118. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
  119. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
  120. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/core/model.py CHANGED
@@ -13,21 +13,21 @@
 # limitations under the License.
 
 import asyncio
+import functools
 import inspect
 import json
 import os
-import uuid
+import time
+import types
+import weakref
 from typing import (
     TYPE_CHECKING,
-    Any,
     AsyncGenerator,
     Callable,
     Dict,
-    Generic,
     Iterator,
     List,
     Optional,
-    TypeVar,
     Union,
 )
 
@@ -35,8 +35,9 @@ import sse_starlette.sse
 import xoscar as xo
 
 if TYPE_CHECKING:
+    from .worker import WorkerActor
     from ..model.llm.core import LLM
-    from ..types import ChatCompletionChunk, CompletionChunk
+    from ..model.core import ModelDescription
 import PIL
 
 import logging
@@ -45,8 +46,6 @@ logger = logging.getLogger(__name__)
 
 from .utils import json_dumps, log_async
 
-T = TypeVar("T")
-
 try:
     from torch.cuda import OutOfMemoryError
 except ImportError:
@@ -88,38 +87,30 @@ def request_limit(fn):
     return wrapped_func
 
 
-class IteratorWrapper(Generic[T]):
-    def __init__(self, uid: str, model_actor_addr: str, model_actor_uid: str):
-        self._uid = uid
-        self._model_actor_addr = model_actor_addr
-        self._model_actor_uid = model_actor_uid
-        self._model_actor_ref: Optional[xo.ActorRefType["ModelActor"]] = None
-
-    async def destroy(self):
-        if self._model_actor_ref is None:
-            self._model_actor_ref = await xo.actor_ref(
-                address=self._model_actor_addr, uid=self._model_actor_uid
-            )
-        assert self._model_actor_ref is not None
-        return await self._model_actor_ref.destroy_generator(self._uid)
+def oom_check(fn):
+    @functools.wraps(fn)
+    def _wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except OutOfMemoryError:
+            logger.exception("Model actor is out of memory.")
+            os._exit(1)
 
-    def __aiter__(self):
-        return self
+    @functools.wraps(fn)
+    async def _async_wrapper(*args, **kwargs):
+        try:
+            return await fn(*args, **kwargs)
+        except OutOfMemoryError:
+            logger.exception("Model actor is out of memory.")
+            os._exit(1)
 
-    async def __anext__(self) -> T:
-        if self._model_actor_ref is None:
-            self._model_actor_ref = await xo.actor_ref(
-                address=self._model_actor_addr, uid=self._model_actor_uid
-            )
+    assert not inspect.isasyncgen(fn)
+    assert not inspect.isgenerator(fn)
 
-        try:
-            assert self._model_actor_ref is not None
-            return await self._model_actor_ref.next(self._uid)
-        except Exception as e:
-            if "StopIteration" in str(e):
-                raise StopAsyncIteration
-            else:
-                raise
+    if asyncio.iscoroutinefunction(fn):
+        return _async_wrapper
+    else:
+        return _wrapper
 
 
 class ModelActor(xo.StatelessActor):
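
Worth noting for readers of the hunk above: oom_check decides sync vs. async once, at decoration time, via asyncio.iscoroutinefunction, and refuses (by assertion) to wrap generator functions, whose OOM handling lives in the streaming paths further down. A minimal standalone sketch of that dispatch rule (names here are hypothetical, not from the diff):

import asyncio
import functools

def dispatch_like_oom_check(fn):
    # Same selection rule as oom_check: coroutine functions get an async
    # wrapper, everything else a sync one; the choice happens exactly once.
    @functools.wraps(fn)
    def _sync(*args, **kwargs):
        return fn(*args, **kwargs)

    @functools.wraps(fn)
    async def _async(*args, **kwargs):
        return await fn(*args, **kwargs)

    return _async if asyncio.iscoroutinefunction(fn) else _sync

async def fake_generate():  # hypothetical async model call
    return "ok"

wrapped = dispatch_like_oom_check(fake_generate)
# The async wrapper is itself a coroutine function, so callers can await it:
assert asyncio.iscoroutinefunction(wrapped)
print(asyncio.run(wrapped()))  # -> ok
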
@@ -152,22 +143,91 @@ class ModelActor(xo.StatelessActor):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def __init__(self, model: "LLM", request_limits: Optional[int] = None):
+    def __init__(
+        self,
+        worker_address: str,
+        model: "LLM",
+        model_description: Optional["ModelDescription"] = None,
+        request_limits: Optional[int] = None,
+    ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel
 
+        self._worker_address = worker_address
         self._model = model
+        self._model_description = (
+            model_description.to_dict() if model_description else {}
+        )
         self._request_limits = request_limits
 
         self._generators: Dict[str, Union[Iterator, AsyncGenerator]] = {}
+        self._current_generator = lambda: None
         self._lock = (
             None
             if isinstance(self._model, (PytorchModel, SpeculativeModel, VLLMModel))
             else asyncio.locks.Lock()
         )
+        self._worker_ref = None
         self._serve_count = 0
+        self._metrics_labels = {
+            "type": self._model_description.get("model_type", "unknown"),
+            "model": self.model_uid(),
+            "node": self._worker_address,
+            "format": self._model_description.get("model_format", "unknown"),
+            "quantization": self._model_description.get("quantization", "none"),
+        }
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    async def __post_create__(self):
+        self._loop = asyncio.get_running_loop()
+
+    async def _record_completion_metrics(
+        self, duration, completion_tokens, prompt_tokens
+    ):
+        coros = []
+        if completion_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "output_tokens_total_counter",
+                    "add",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": completion_tokens,
+                    },
+                )
+            )
+        if prompt_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "input_tokens_total_counter",
+                    "add",
+                    {"labels": self._metrics_labels, "value": prompt_tokens},
+                )
+            )
+        if completion_tokens > 0:
+            generate_throughput = completion_tokens / duration
+            coros.append(
+                self.record_metrics(
+                    "generate_throughput",
+                    "set",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": generate_throughput,
+                    },
+                )
+            )
+        await asyncio.gather(*coros)
+
+    async def _get_worker_ref(self) -> xo.ActorRefType["WorkerActor"]:
+        from .worker import WorkerActor
+
+        if self._worker_ref is None:
+            self._worker_ref = await xo.actor_ref(
+                address=self._worker_address, uid=WorkerActor.uid()
+            )
+        return self._worker_ref
 
     def is_vllm_backend(self) -> bool:
         from ..model.llm.vllm.core import VLLMModel
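
For orientation, the three completion metrics registered above reduce to simple arithmetic over the final usage counts; a worked example with hypothetical numbers:

# Hypothetical figures for one completed request:
duration = 2.0                            # wall-clock seconds for the whole call
completion_tokens, prompt_tokens = 128, 512

# What _record_completion_metrics would emit for it:
#   output_tokens_total_counter  <- "add" 128
#   input_tokens_total_counter   <- "add" 512
generate_throughput = completion_tokens / duration
print(generate_throughput)                # 64.0 tokens/s, "set" on the throughput gauge
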
@@ -188,106 +248,158 @@ class ModelActor(xo.StatelessActor):
             )
         )
 
-    def _wrap_generator(self, ret: Any):
-        if inspect.isgenerator(ret) or inspect.isasyncgen(ret):
-            if self._lock is not None and self._generators:
-                raise Exception("Parallel generation is not supported by ggml.")
-            generator_uid = str(uuid.uuid1())
-            self._generators[generator_uid] = ret
-
-            return IteratorWrapper(
-                uid=generator_uid,
-                model_actor_addr=self.address,
-                model_actor_uid=self.uid,
-            )
-        else:
-            return json_dumps(ret)
-
-    async def _call_wrapper(self, _wrapper: Callable):
+    def _to_json_generator(self, gen: types.GeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
-            assert not (
-                inspect.iscoroutinefunction(_wrapper)
-                or inspect.isasyncgenfunction(_wrapper)
-            )
-            if self._lock is None:
-                return await asyncio.to_thread(_wrapper)
-            else:
-                async with self._lock:
-                    return await asyncio.to_thread(_wrapper)
+            for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
+                v = dict(data=json.dumps(v))
+                yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:
             logger.exception(
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            if self._loop is not None and time_to_first_token is not None:
+                coro = self.record_metrics(
+                    "time_to_first_token",
+                    "set",
+                    {"labels": self._metrics_labels, "value": time_to_first_token},
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)
+            if self._loop is not None and final_usage is not None:
+                coro = self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens=final_usage["completion_tokens"],
+                    prompt_tokens=final_usage["prompt_tokens"],
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)
 
-    async def _call_async_wrapper(self, _wrapper: Callable):
+    async def _to_json_async_gen(self, gen: types.AsyncGeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
-            return await asyncio.create_task(_wrapper())
+            async for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
+                v = await asyncio.to_thread(json.dumps, v)
+                v = dict(data=v)  # noqa: F821
+                yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
         except OutOfMemoryError:
             logger.exception(
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            coros = []
+            if time_to_first_token is not None:
+                coros.append(
+                    self.record_metrics(
+                        "time_to_first_token",
+                        "set",
+                        {"labels": self._metrics_labels, "value": time_to_first_token},
+                    )
+                )
+            if final_usage is not None:
+                coros.append(
+                    self._record_completion_metrics(
+                        time.time() - start_time,
+                        completion_tokens=final_usage["completion_tokens"],
+                        prompt_tokens=final_usage["prompt_tokens"],
+                    )
+                )
+            await asyncio.gather(*coros)
+
+    @oom_check
+    async def _call_wrapper(self, fn: Callable, *args, **kwargs):
+        if self._lock is None:
+            if inspect.iscoroutinefunction(fn):
+                ret = await fn(*args, **kwargs)
+            else:
+                ret = await asyncio.to_thread(fn, *args, **kwargs)
+        else:
+            async with self._lock:
+                if inspect.iscoroutinefunction(fn):
+                    ret = await fn(*args, **kwargs)
+                else:
+                    ret = await asyncio.to_thread(fn, *args, **kwargs)
+
+        if self._lock is not None and self._current_generator():
+            raise Exception("Parallel generation is not supported by ggml.")
+
+        if inspect.isgenerator(ret):
+            gen = self._to_json_generator(ret)
+            self._current_generator = weakref.ref(gen)
+            return gen
+        if inspect.isasyncgen(ret):
+            gen = self._to_json_async_gen(ret)
+            self._current_generator = weakref.ref(gen)
+            return gen
+        return await asyncio.to_thread(json_dumps, ret)
 
     @log_async(logger=logger)
     @request_limit
+    @xo.generator
     async def generate(self, prompt: str, *args, **kwargs):
-        if not hasattr(self._model, "generate") and not hasattr(
-            self._model, "async_generate"
-        ):
-            raise AttributeError(f"Model {self._model.model_spec} is not for generate.")
-
-        def _wrapper():
-            return self._wrap_generator(
-                getattr(self._model, "generate")(prompt, *args, **kwargs)
+        if hasattr(self._model, "generate"):
+            return await self._call_wrapper(
+                self._model.generate, prompt, *args, **kwargs
             )
-
-        async def _async_wrapper():
-            # for vLLM.
-            return self._wrap_generator(
-                await getattr(self._model, "async_generate")(prompt, *args, **kwargs)
+        if hasattr(self._model, "async_generate"):
+            return await self._call_wrapper(
+                self._model.async_generate, prompt, *args, **kwargs
             )
-
-        if hasattr(self._model, "generate"):
-            return await self._call_wrapper(_wrapper)
-        else:
-            return await self._call_async_wrapper(_async_wrapper)
+        raise AttributeError(f"Model {self._model.model_spec} is not for generate.")
 
     @log_async(logger=logger)
     @request_limit
+    @xo.generator
     async def chat(self, prompt: str, *args, **kwargs):
-        if not hasattr(self._model, "chat") and not hasattr(self._model, "async_chat"):
+        start_time = time.time()
+        response = None
+        try:
+            if hasattr(self._model, "chat"):
+                response = await self._call_wrapper(
+                    self._model.chat, prompt, *args, **kwargs
+                )
+                return response
+            if hasattr(self._model, "async_chat"):
+                response = await self._call_wrapper(
+                    self._model.async_chat, prompt, *args, **kwargs
+                )
+                return response
             raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
-
-        def _wrapper():
-            return self._wrap_generator(
-                getattr(self._model, "chat")(prompt, *args, **kwargs)
-            )
-
-        async def _async_wrapper():
-            # for vLLM.
-            return self._wrap_generator(
-                await getattr(self._model, "async_chat")(prompt, *args, **kwargs)
-            )
-
-        if hasattr(self._model, "async_chat"):
-            return await self._call_async_wrapper(_async_wrapper)
-        else:
-            return await self._call_wrapper(_wrapper)
+        finally:
+            # For the non stream result.
+            if response is not None and isinstance(response, dict):
+                usage = response["usage"]
+                # Some backends may not have a valid usage, we just skip them.
+                completion_tokens = usage["completion_tokens"]
+                prompt_tokens = usage["prompt_tokens"]
+                await self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens,
+                    prompt_tokens,
+                )
 
     @log_async(logger=logger)
     @request_limit
     async def create_embedding(self, input: Union[str, List[str]], *args, **kwargs):
-        if not hasattr(self._model, "create_embedding"):
-            raise AttributeError(
-                f"Model {self._model.model_spec} is not for creating embedding."
+        if hasattr(self._model, "create_embedding"):
+            return await self._call_wrapper(
+                self._model.create_embedding, input, *args, **kwargs
             )
 
-        def _wrapper():
-            data = getattr(self._model, "create_embedding")(input, *args, **kwargs)
-            return json_dumps(data)
-
-        return await self._call_wrapper(_wrapper)
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating embedding."
+        )
 
     @log_async(logger=logger)
     @request_limit
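
A subtlety in _call_wrapper above: self._current_generator starts life as lambda: None and is later swapped for a weakref.ref to the live JSON generator, so the sentinel and a dead reference answer identically when called, and a consumed or dropped stream stops blocking the next ggml request without any explicit cleanup call. A standalone sketch of that mechanism (hypothetical names; relies on CPython refcounting to collect the generator promptly):

import weakref

current = lambda: None  # sentinel: "no stream in flight", same call shape as a weakref

def stream():
    yield b"data: ..."

g = stream()
current = weakref.ref(g)
assert current() is g     # a live stream makes the ggml path refuse parallel generation
del g                     # consumer finished or abandoned the stream
assert current() is None  # the slot frees itself; no destroy_generator() needed anymore
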
@@ -301,13 +413,9 @@ class ModelActor(xo.StatelessActor):
         *args,
         **kwargs,
     ):
-        if not hasattr(self._model, "rerank"):
-            raise AttributeError(
-                f"Model {self._model.model_spec} is not for reranking."
-            )
-
-        def _wrapper():
-            data = getattr(self._model, "rerank")(
+        if hasattr(self._model, "rerank"):
+            return await self._call_wrapper(
+                self._model.rerank,
                 documents,
                 query,
                 top_n,
@@ -316,9 +424,7 @@
                 *args,
                 **kwargs,
             )
-            return json_dumps(data)
-
-        return await self._call_wrapper(_wrapper)
+        raise AttributeError(f"Model {self._model.model_spec} is not for reranking.")
 
     @log_async(logger=logger)
     @request_limit
@@ -331,18 +437,19 @@
         *args,
         **kwargs,
     ):
-        if not hasattr(self._model, "text_to_image"):
-            raise AttributeError(
-                f"Model {self._model.model_spec} is not for creating image."
-            )
-
-        def _wrapper():
-            r = getattr(self._model, "text_to_image")(
-                prompt, n, size, response_format, *args, **kwargs
+        if hasattr(self._model, "text_to_image"):
+            return await self._call_wrapper(
+                self._model.text_to_image,
+                prompt,
+                n,
+                size,
+                response_format,
+                *args,
+                **kwargs,
             )
-            return json_dumps(r)
-
-        return await self._call_wrapper(_wrapper)
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating image."
+        )
 
     async def image_to_image(
         self,
@@ -355,13 +462,9 @@
         *args,
         **kwargs,
     ):
-        if not hasattr(self._model, "image_to_image"):
-            raise AttributeError(
-                f"Model {self._model.model_spec} is not for creating image."
-            )
-
-        def _wrapper():
-            r = getattr(self._model, "image_to_image")(
+        if hasattr(self._model, "image_to_image"):
+            return await self._call_wrapper(
+                self._model.image_to_image,
                 image,
                 prompt,
                 negative_prompt,
@@ -371,50 +474,10 @@
                 *args,
                 **kwargs,
             )
-            return json_dumps(r)
-
-        return await self._call_wrapper(_wrapper)
-
-    async def next(
-        self, generator_uid: str
-    ) -> Union["ChatCompletionChunk", "CompletionChunk"]:
-        assert generator_uid in self._generators
-        stop = object()
-        gen = self._generators[generator_uid]
-
-        def _wrapper():
-            try:
-                v = dict(data=json.dumps(next(gen)))
-                return sse_starlette.sse.ensure_bytes(v, None)
-            except StopIteration:
-                return stop
-
-        async def _async_wrapper():
-            try:
-                # anext is only available for Python >= 3.10
-                v = await gen.__anext__()
-                v = await asyncio.to_thread(json.dumps, v)
-                v = dict(data=v)  # noqa: F821
-                return await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
-            except StopAsyncIteration:
-                return stop
-
-        if inspect.isgenerator(gen):
-            r = await self._call_wrapper(_wrapper)
-        elif inspect.isasyncgen(gen):
-            # for vLLM.
-            r = await self._call_async_wrapper(_async_wrapper)
-        else:
-            raise TypeError(
-                f"Unexpected type {type(gen)}, expecting generator or async generator"
-            )
-
-        if r is stop:
-            self._generators.pop(generator_uid, None)
-            raise Exception("StopIteration")
-        else:
-            return r
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating image."
+        )
 
-    @log_async(logger=logger)
-    async def destroy_generator(self, generator_uid: str):
-        self._generators.pop(generator_uid, None)
+    async def record_metrics(self, name, op, kwargs):
+        worker_ref = await self._get_worker_ref()
+        await worker_ref.record_metrics(name, op, kwargs)
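
After these hunks, every capability method (generate, chat, create_embedding, rerank, text_to_image, image_to_image) follows one shape: probe the backend with hasattr, delegate the first match through _call_wrapper, and raise AttributeError only when nothing matches. A distilled, standalone sketch of that pattern (helper names hypothetical, not xinference API):

import asyncio
import json
from typing import Any, Callable, List, Optional

def json_dumps(obj: Any) -> bytes:
    # stand-in for xinference's json_dumps helper (assumption: same role)
    return json.dumps(obj).encode()

async def call_capability(model: Any, names: List[str], *args, **kwargs) -> bytes:
    # Probe each backend entry point in order, delegate the first hit,
    # raise AttributeError only when none exists.
    for name in names:
        fn: Optional[Callable] = getattr(model, name, None)
        if fn is not None:
            ret = fn(*args, **kwargs)
            if asyncio.iscoroutine(ret):  # async backends such as vLLM
                ret = await ret
            return json_dumps(ret)
    raise AttributeError(f"Model {model!r} supports none of {names}.")

class EmbedOnly:
    def create_embedding(self, text: str):
        return {"object": "list", "data": [[0.0, 1.0]], "input": text}

print(asyncio.run(call_capability(EmbedOnly(), ["create_embedding"], "hi")))
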
xinference/core/status_guard.py ADDED
@@ -0,0 +1,86 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from logging import getLogger
+from typing import Dict, List, Optional
+
+import xoscar as xo
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+
+class LaunchStatus(Enum):
+    CREATING = 1
+    UPDATING = 2
+    TERMINATING = 3
+    TERMINATED = 4
+    READY = 5
+    ERROR = 6
+
+
+class InstanceInfo(BaseModel):
+    model_name: str
+    model_uid: str
+    model_ability: List[str]
+    replica: int
+    status: str
+    instance_created_ts: int
+
+    def update(self, **kwargs):
+        for field, value in kwargs.items():
+            setattr(self, field, value)
+
+
+class StatusGuardActor(xo.StatelessActor):
+    def __init__(self):
+        super().__init__()
+        self._model_uid_to_info: Dict[str, InstanceInfo] = {}
+
+    @classmethod
+    def uid(cls) -> str:
+        return "status_guard"
+
+    @staticmethod
+    def _drop_terminated_info(instance_infos: List[InstanceInfo]) -> List[InstanceInfo]:
+        return [
+            info
+            for info in instance_infos
+            if info.status != LaunchStatus.TERMINATED.name
+        ]
+
+    def set_instance_info(self, model_uid: str, info: InstanceInfo):
+        self._model_uid_to_info[model_uid] = info
+
+    def get_instance_info(
+        self, model_name: Optional[str] = None, model_uid: Optional[str] = None
+    ) -> List[InstanceInfo]:
+        if model_uid is not None:
+            return (
+                self._drop_terminated_info([self._model_uid_to_info[model_uid]])
+                if model_uid in self._model_uid_to_info
+                else []
+            )
+        all_infos: List[InstanceInfo] = list(self._model_uid_to_info.values())
+        filtered_infos: List[InstanceInfo] = list(
+            filter(lambda info: info.model_name == model_name, all_infos)
+        )
+        return (
+            self._drop_terminated_info(filtered_infos)
+            if model_name is not None
+            else self._drop_terminated_info(all_infos)
+        )
+
+    def update_instance_info(self, model_uid: str, info: Dict):
+        self._model_uid_to_info[model_uid].update(**info)
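
StatusGuardActor keeps one InstanceInfo record per model_uid and hides TERMINATED entries from every read via _drop_terminated_info. The record type itself is plain pydantic and can be exercised outside the actor; a small sketch, assuming the definitions above are importable and using hypothetical values:

info = InstanceInfo(
    model_name="qwen-chat",
    model_uid="qwen-chat-0",
    model_ability=["chat"],
    replica=1,
    status=LaunchStatus.CREATING.name,
    instance_created_ts=1704067200,
)
info.update(status=LaunchStatus.READY.name)  # supervisor flips status as a launch progresses
assert info.status == "READY"
# Once status becomes "TERMINATED" the record stays stored but no longer
# shows up in get_instance_info() results.
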