xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/oauth2/__init__.py +13 -0
- xinference/api/oauth2/common.py +14 -0
- xinference/api/oauth2/core.py +93 -0
- xinference/api/oauth2/types.py +36 -0
- xinference/api/oauth2/utils.py +44 -0
- xinference/api/restful_api.py +216 -27
- xinference/client/oscar/actor_client.py +18 -18
- xinference/client/restful/restful_client.py +96 -33
- xinference/conftest.py +63 -1
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +143 -3
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +244 -181
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +134 -13
- xinference/deploy/cmdline.py +142 -16
- xinference/deploy/local.py +39 -7
- xinference/deploy/supervisor.py +2 -0
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/core.py +8 -1
- xinference/model/embedding/core.py +3 -2
- xinference/model/embedding/model_spec_modelscope.json +60 -18
- xinference/model/image/stable_diffusion/core.py +4 -3
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +87 -3
- xinference/model/llm/llm_family.py +15 -5
- xinference/model/llm/llm_family_modelscope.json +92 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/core.py +8 -1
- xinference/model/multimodal/model_spec.json +9 -0
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/multimodal/qwen_vl.py +5 -9
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
- xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +36 -0
- xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
- xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
- xinference/web/ui/node_modules/react-cookie/package.json +55 -0
- xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
- xinference/web/ui/package-lock.json +37 -0
- xinference/web/ui/package.json +3 -2
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
- xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/core/model.py
CHANGED
@@ -13,21 +13,21 @@
 # limitations under the License.

 import asyncio
+import functools
 import inspect
 import json
 import os
-import
+import time
+import types
+import weakref
 from typing import (
     TYPE_CHECKING,
-    Any,
     AsyncGenerator,
     Callable,
     Dict,
-    Generic,
     Iterator,
     List,
     Optional,
-    TypeVar,
     Union,
 )

@@ -35,8 +35,9 @@ import sse_starlette.sse
 import xoscar as xo

 if TYPE_CHECKING:
+    from .worker import WorkerActor
     from ..model.llm.core import LLM
-    from ..
+    from ..model.core import ModelDescription
     import PIL

 import logging
@@ -45,8 +46,6 @@ logger = logging.getLogger(__name__)

 from .utils import json_dumps, log_async

-T = TypeVar("T")
-
 try:
     from torch.cuda import OutOfMemoryError
 except ImportError:
@@ -88,38 +87,30 @@ def request_limit(fn):
     return wrapped_func


-
-
-
-
-
-
-
-
-        if self._model_actor_ref is None:
-            self._model_actor_ref = await xo.actor_ref(
-                address=self._model_actor_addr, uid=self._model_actor_uid
-            )
-        assert self._model_actor_ref is not None
-        return await self._model_actor_ref.destroy_generator(self._uid)
+def oom_check(fn):
+    @functools.wraps(fn)
+    def _wrapper(*args, **kwargs):
+        try:
+            return fn(*args, **kwargs)
+        except OutOfMemoryError:
+            logger.exception("Model actor is out of memory.")
+            os._exit(1)

-
-
+    @functools.wraps(fn)
+    async def _async_wrapper(*args, **kwargs):
+        try:
+            return await fn(*args, **kwargs)
+        except OutOfMemoryError:
+            logger.exception("Model actor is out of memory.")
+            os._exit(1)

-
-
-            self._model_actor_ref = await xo.actor_ref(
-                address=self._model_actor_addr, uid=self._model_actor_uid
-            )
+    assert not inspect.isasyncgen(fn)
+    assert not inspect.isgenerator(fn)

-
-
-
-
-            if "StopIteration" in str(e):
-                raise StopAsyncIteration
-            else:
-                raise
+    if asyncio.iscoroutinefunction(fn):
+        return _async_wrapper
+    else:
+        return _wrapper


 class ModelActor(xo.StatelessActor):
@@ -152,22 +143,91 @@ class ModelActor(xo.StatelessActor):
             gc.collect()
             torch.cuda.empty_cache()

-    def __init__(
+    def __init__(
+        self,
+        worker_address: str,
+        model: "LLM",
+        model_description: Optional["ModelDescription"] = None,
+        request_limits: Optional[int] = None,
+    ):
         super().__init__()
         from ..model.llm.pytorch.core import PytorchModel
         from ..model.llm.pytorch.spec_model import SpeculativeModel
         from ..model.llm.vllm.core import VLLMModel

+        self._worker_address = worker_address
         self._model = model
+        self._model_description = (
+            model_description.to_dict() if model_description else {}
+        )
         self._request_limits = request_limits

         self._generators: Dict[str, Union[Iterator, AsyncGenerator]] = {}
+        self._current_generator = lambda: None
         self._lock = (
             None
             if isinstance(self._model, (PytorchModel, SpeculativeModel, VLLMModel))
             else asyncio.locks.Lock()
         )
+        self._worker_ref = None
         self._serve_count = 0
+        self._metrics_labels = {
+            "type": self._model_description.get("model_type", "unknown"),
+            "model": self.model_uid(),
+            "node": self._worker_address,
+            "format": self._model_description.get("model_format", "unknown"),
+            "quantization": self._model_description.get("quantization", "none"),
+        }
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    async def __post_create__(self):
+        self._loop = asyncio.get_running_loop()
+
+    async def _record_completion_metrics(
+        self, duration, completion_tokens, prompt_tokens
+    ):
+        coros = []
+        if completion_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "output_tokens_total_counter",
+                    "add",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": completion_tokens,
+                    },
+                )
+            )
+        if prompt_tokens > 0:
+            coros.append(
+                self.record_metrics(
+                    "input_tokens_total_counter",
+                    "add",
+                    {"labels": self._metrics_labels, "value": prompt_tokens},
+                )
+            )
+        if completion_tokens > 0:
+            generate_throughput = completion_tokens / duration
+            coros.append(
+                self.record_metrics(
+                    "generate_throughput",
+                    "set",
+                    {
+                        "labels": self._metrics_labels,
+                        "value": generate_throughput,
+                    },
+                )
+            )
+        await asyncio.gather(*coros)
+
+    async def _get_worker_ref(self) -> xo.ActorRefType["WorkerActor"]:
+        from .worker import WorkerActor
+
+        if self._worker_ref is None:
+            self._worker_ref = await xo.actor_ref(
+                address=self._worker_address, uid=WorkerActor.uid()
+            )
+        return self._worker_ref

     def is_vllm_backend(self) -> bool:
         from ..model.llm.vllm.core import VLLMModel
@@ -188,106 +248,158 @@ class ModelActor(xo.StatelessActor):
             )
         )

-    def
-
-
-
-            generator_uid = str(uuid.uuid1())
-            self._generators[generator_uid] = ret
-
-            return IteratorWrapper(
-                uid=generator_uid,
-                model_actor_addr=self.address,
-                model_actor_uid=self.uid,
-            )
-        else:
-            return json_dumps(ret)
-
-    async def _call_wrapper(self, _wrapper: Callable):
+    def _to_json_generator(self, gen: types.GeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
-
-
-
-
-
-
-            else:
-                async with self._lock:
-                    return await asyncio.to_thread(_wrapper)
+            for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
+                v = dict(data=json.dumps(v))
+                yield sse_starlette.sse.ensure_bytes(v, None)
         except OutOfMemoryError:
             logger.exception(
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            if self._loop is not None and time_to_first_token is not None:
+                coro = self.record_metrics(
+                    "time_to_first_token",
+                    "set",
+                    {"labels": self._metrics_labels, "value": time_to_first_token},
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)
+            if self._loop is not None and final_usage is not None:
+                coro = self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens=final_usage["completion_tokens"],
+                    prompt_tokens=final_usage["prompt_tokens"],
+                )
+                asyncio.run_coroutine_threadsafe(coro, loop=self._loop)

-    async def
+    async def _to_json_async_gen(self, gen: types.AsyncGeneratorType):
+        start_time = time.time()
+        time_to_first_token = None
+        final_usage = None
         try:
-
+            async for v in gen:
+                if time_to_first_token is None:
+                    time_to_first_token = (time.time() - start_time) * 1000
+                final_usage = v.pop("usage", None)
+                v = await asyncio.to_thread(json.dumps, v)
+                v = dict(data=v)  # noqa: F821
+                yield await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
         except OutOfMemoryError:
             logger.exception(
                 "Model actor is out of memory, model id: %s", self.model_uid()
             )
             os._exit(1)
+        finally:
+            coros = []
+            if time_to_first_token is not None:
+                coros.append(
+                    self.record_metrics(
+                        "time_to_first_token",
+                        "set",
+                        {"labels": self._metrics_labels, "value": time_to_first_token},
+                    )
+                )
+            if final_usage is not None:
+                coros.append(
+                    self._record_completion_metrics(
+                        time.time() - start_time,
+                        completion_tokens=final_usage["completion_tokens"],
+                        prompt_tokens=final_usage["prompt_tokens"],
+                    )
+                )
+            await asyncio.gather(*coros)
+
+    @oom_check
+    async def _call_wrapper(self, fn: Callable, *args, **kwargs):
+        if self._lock is None:
+            if inspect.iscoroutinefunction(fn):
+                ret = await fn(*args, **kwargs)
+            else:
+                ret = await asyncio.to_thread(fn, *args, **kwargs)
+        else:
+            async with self._lock:
+                if inspect.iscoroutinefunction(fn):
+                    ret = await fn(*args, **kwargs)
+                else:
+                    ret = await asyncio.to_thread(fn, *args, **kwargs)
+
+        if self._lock is not None and self._current_generator():
+            raise Exception("Parallel generation is not supported by ggml.")
+
+        if inspect.isgenerator(ret):
+            gen = self._to_json_generator(ret)
+            self._current_generator = weakref.ref(gen)
+            return gen
+        if inspect.isasyncgen(ret):
+            gen = self._to_json_async_gen(ret)
+            self._current_generator = weakref.ref(gen)
+            return gen
+        return await asyncio.to_thread(json_dumps, ret)

     @log_async(logger=logger)
     @request_limit
+    @xo.generator
     async def generate(self, prompt: str, *args, **kwargs):
-        if
-            self.
-
-            raise AttributeError(f"Model {self._model.model_spec} is not for generate.")
-
-        def _wrapper():
-            return self._wrap_generator(
-                getattr(self._model, "generate")(prompt, *args, **kwargs)
+        if hasattr(self._model, "generate"):
+            return await self._call_wrapper(
+                self._model.generate, prompt, *args, **kwargs
             )
-
-
-
-            return self._wrap_generator(
-                await getattr(self._model, "async_generate")(prompt, *args, **kwargs)
+        if hasattr(self._model, "async_generate"):
+            return await self._call_wrapper(
+                self._model.async_generate, prompt, *args, **kwargs
             )
-
-        if hasattr(self._model, "generate"):
-            return await self._call_wrapper(_wrapper)
-        else:
-            return await self._call_async_wrapper(_async_wrapper)
+        raise AttributeError(f"Model {self._model.model_spec} is not for generate.")

     @log_async(logger=logger)
     @request_limit
+    @xo.generator
     async def chat(self, prompt: str, *args, **kwargs):
-
+        start_time = time.time()
+        response = None
+        try:
+            if hasattr(self._model, "chat"):
+                response = await self._call_wrapper(
+                    self._model.chat, prompt, *args, **kwargs
+                )
+                return response
+            if hasattr(self._model, "async_chat"):
+                response = await self._call_wrapper(
+                    self._model.async_chat, prompt, *args, **kwargs
+                )
+                return response
             raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
-
-
-
-
-
-
-
-
-
-
-
-
-        if hasattr(self._model, "async_chat"):
-            return await self._call_async_wrapper(_async_wrapper)
-        else:
-            return await self._call_wrapper(_wrapper)
+        finally:
+            # For the non stream result.
+            if response is not None and isinstance(response, dict):
+                usage = response["usage"]
+                # Some backends may not have a valid usage, we just skip them.
+                completion_tokens = usage["completion_tokens"]
+                prompt_tokens = usage["prompt_tokens"]
+                await self._record_completion_metrics(
+                    time.time() - start_time,
+                    completion_tokens,
+                    prompt_tokens,
+                )

     @log_async(logger=logger)
     @request_limit
     async def create_embedding(self, input: Union[str, List[str]], *args, **kwargs):
-        if
-
-
+        if hasattr(self._model, "create_embedding"):
+            return await self._call_wrapper(
+                self._model.create_embedding, input, *args, **kwargs
             )

-
-
-
-
-        return await self._call_wrapper(_wrapper)
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating embedding."
+        )

     @log_async(logger=logger)
     @request_limit
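The hunk above is the heart of this refactor: instead of stashing raw generators and making clients pull chunks one at a time, `_call_wrapper` now turns whatever (async) generator a model returns into a stream of pre-encoded SSE byte frames. A minimal, standalone sketch of just that framing step, assuming a made-up chunk payload; only the `dict(data=json.dumps(...))` convention and the `ensure_bytes` call come from the diff, and `ensure_bytes` behaves this way in the sse_starlette versions of this era:

import json

import sse_starlette.sse


def to_sse_frames(gen):
    # Mirror of _to_json_generator: pop "usage" (the diff keeps it for
    # metrics), JSON-encode the rest, and let sse_starlette render the
    # "data: ..." wire format as bytes.
    for chunk in gen:
        usage = chunk.pop("usage", None)  # recorded as metrics in the diff
        event = dict(data=json.dumps(chunk))
        yield sse_starlette.sse.ensure_bytes(event, None)


# Stand-in chunks, not real model output:
chunks = [{"choices": [{"text": "Hel"}]}, {"choices": [{"text": "lo"}], "usage": {}}]
for frame in to_sse_frames(iter(chunks)):
    print(frame)  # e.g. b'data: {"choices": [{"text": "Hel"}]}\r\n\r\n'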
@@ -301,13 +413,9 @@ class ModelActor(xo.StatelessActor):
         *args,
         **kwargs,
     ):
-        if
-
-
-        )
-
-        def _wrapper():
-            data = getattr(self._model, "rerank")(
+        if hasattr(self._model, "rerank"):
+            return await self._call_wrapper(
+                self._model.rerank,
                 documents,
                 query,
                 top_n,
@@ -316,9 +424,7 @@ class ModelActor(xo.StatelessActor):
                 *args,
                 **kwargs,
             )
-
-
-        return await self._call_wrapper(_wrapper)
+        raise AttributeError(f"Model {self._model.model_spec} is not for reranking.")

     @log_async(logger=logger)
     @request_limit
@@ -331,18 +437,19 @@ class ModelActor(xo.StatelessActor):
         *args,
         **kwargs,
     ):
-        if
-
-
-
-
-
-
+        if hasattr(self._model, "text_to_image"):
+            return await self._call_wrapper(
+                self._model.text_to_image,
+                prompt,
+                n,
+                size,
+                response_format,
+                *args,
+                **kwargs,
             )
-
-
-
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating image."
+        )

     async def image_to_image(
         self,
@@ -355,13 +462,9 @@ class ModelActor(xo.StatelessActor):
         *args,
         **kwargs,
     ):
-        if
-
-
-        )
-
-        def _wrapper():
-            r = getattr(self._model, "image_to_image")(
+        if hasattr(self._model, "image_to_image"):
+            return await self._call_wrapper(
+                self._model.image_to_image,
                 image,
                 prompt,
                 negative_prompt,
@@ -371,50 +474,10 @@ class ModelActor(xo.StatelessActor):
                 *args,
                 **kwargs,
             )
-
-
-
-
-    async def next(
-        self, generator_uid: str
-    ) -> Union["ChatCompletionChunk", "CompletionChunk"]:
-        assert generator_uid in self._generators
-        stop = object()
-        gen = self._generators[generator_uid]
-
-        def _wrapper():
-            try:
-                v = dict(data=json.dumps(next(gen)))
-                return sse_starlette.sse.ensure_bytes(v, None)
-            except StopIteration:
-                return stop
-
-        async def _async_wrapper():
-            try:
-                # anext is only available for Python >= 3.10
-                v = await gen.__anext__()
-                v = await asyncio.to_thread(json.dumps, v)
-                v = dict(data=v)  # noqa: F821
-                return await asyncio.to_thread(sse_starlette.sse.ensure_bytes, v, None)
-            except StopAsyncIteration:
-                return stop
-
-        if inspect.isgenerator(gen):
-            r = await self._call_wrapper(_wrapper)
-        elif inspect.isasyncgen(gen):
-            # for vLLM.
-            r = await self._call_async_wrapper(_async_wrapper)
-        else:
-            raise TypeError(
-                f"Unexpected type {type(gen)}, expecting generator or async generator"
-            )
-
-        if r is stop:
-            self._generators.pop(generator_uid, None)
-            raise Exception("StopIteration")
-        else:
-            return r
+        raise AttributeError(
+            f"Model {self._model.model_spec} is not for creating image."
+        )

-
-
-
+    async def record_metrics(self, name, op, kwargs):
+        worker_ref = await self._get_worker_ref()
+        await worker_ref.record_metrics(name, op, kwargs)
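Two small patterns in this file are worth pulling out: `oom_check` returns a sync or an async wrapper depending on what it decorates, and the decorated `_call_wrapper` pushes synchronous model methods onto a thread via `asyncio.to_thread` so the actor's event loop is never blocked. A condensed, runnable sketch of the same dispatch, assuming `MemoryError` and `SystemExit` as demo-safe stand-ins for the real `torch.cuda.OutOfMemoryError` and `os._exit(1)`:

import asyncio
import functools
import inspect


def oom_check(fn):
    # One decorator, two wrappers: which one is returned depends on
    # whether the wrapped function is a coroutine function.
    @functools.wraps(fn)
    def _wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except MemoryError:  # stand-in; the diff catches torch's OutOfMemoryError
            raise SystemExit(1)  # the diff hard-exits with os._exit(1)

    @functools.wraps(fn)
    async def _async_wrapper(*args, **kwargs):
        try:
            return await fn(*args, **kwargs)
        except MemoryError:
            raise SystemExit(1)

    return _async_wrapper if asyncio.iscoroutinefunction(fn) else _wrapper


@oom_check
async def call_wrapper(fn, *args, **kwargs):
    # Sync model methods run in a worker thread so the event loop stays
    # responsive; async methods are awaited directly.
    if inspect.iscoroutinefunction(fn):
        return await fn(*args, **kwargs)
    return await asyncio.to_thread(fn, *args, **kwargs)


def generate(prompt: str) -> dict:
    return {"text": prompt.upper()}


print(asyncio.run(call_wrapper(generate, "hello")))  # {'text': 'HELLO'}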
xinference/core/status_guard.py
ADDED

@@ -0,0 +1,86 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from logging import getLogger
+from typing import Dict, List, Optional
+
+import xoscar as xo
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+
+class LaunchStatus(Enum):
+    CREATING = 1
+    UPDATING = 2
+    TERMINATING = 3
+    TERMINATED = 4
+    READY = 5
+    ERROR = 6
+
+
+class InstanceInfo(BaseModel):
+    model_name: str
+    model_uid: str
+    model_ability: List[str]
+    replica: int
+    status: str
+    instance_created_ts: int
+
+    def update(self, **kwargs):
+        for field, value in kwargs.items():
+            setattr(self, field, value)
+
+
+class StatusGuardActor(xo.StatelessActor):
+    def __init__(self):
+        super().__init__()
+        self._model_uid_to_info: Dict[str, InstanceInfo] = {}
+
+    @classmethod
+    def uid(cls) -> str:
+        return "status_guard"
+
+    @staticmethod
+    def _drop_terminated_info(instance_infos: List[InstanceInfo]) -> List[InstanceInfo]:
+        return [
+            info
+            for info in instance_infos
+            if info.status != LaunchStatus.TERMINATED.name
+        ]
+
+    def set_instance_info(self, model_uid: str, info: InstanceInfo):
+        self._model_uid_to_info[model_uid] = info
+
+    def get_instance_info(
+        self, model_name: Optional[str] = None, model_uid: Optional[str] = None
+    ) -> List[InstanceInfo]:
+        if model_uid is not None:
+            return (
+                self._drop_terminated_info([self._model_uid_to_info[model_uid]])
+                if model_uid in self._model_uid_to_info
+                else []
+            )
+        all_infos: List[InstanceInfo] = list(self._model_uid_to_info.values())
+        filtered_infos: List[InstanceInfo] = list(
+            filter(lambda info: info.model_name == model_name, all_infos)
+        )
+        return (
+            self._drop_terminated_info(filtered_infos)
+            if model_name is not None
+            else self._drop_terminated_info(all_infos)
+        )
+
+    def update_instance_info(self, model_uid: str, info: Dict):
+        self._model_uid_to_info[model_uid].update(**info)
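The new status_guard.py gives the supervisor a small registry of per-instance launch states. A quick sketch of the bookkeeping it enables, exercising the classes above directly (the uid and name are invented, and in xinference these calls go through the `StatusGuardActor` over xoscar rather than a local dict):

import time

# InstanceInfo and LaunchStatus as defined in status_guard.py above.
from xinference.core.status_guard import InstanceInfo, LaunchStatus

infos = {}
infos["my-model-0"] = InstanceInfo(  # hypothetical uid/name
    model_name="my-model",
    model_uid="my-model-0",
    model_ability=["chat"],
    replica=1,
    status=LaunchStatus.CREATING.name,
    instance_created_ts=int(time.time()),
)

# As a launch progresses, the status field is flipped in place:
infos["my-model-0"].update(status=LaunchStatus.READY.name)

# get_instance_info() drops TERMINATED entries the same way:
live = [i for i in infos.values() if i.status != LaunchStatus.TERMINATED.name]
print([(i.model_uid, i.status) for i in live])  # [('my-model-0', 'READY')]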
|