xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- xinference/_version.py +3 -3
- xinference/api/oauth2/__init__.py +13 -0
- xinference/api/oauth2/common.py +14 -0
- xinference/api/oauth2/core.py +93 -0
- xinference/api/oauth2/types.py +36 -0
- xinference/api/oauth2/utils.py +44 -0
- xinference/api/restful_api.py +216 -27
- xinference/client/oscar/actor_client.py +18 -18
- xinference/client/restful/restful_client.py +96 -33
- xinference/conftest.py +63 -1
- xinference/constants.py +1 -0
- xinference/core/chat_interface.py +143 -3
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +244 -181
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +134 -13
- xinference/deploy/cmdline.py +142 -16
- xinference/deploy/local.py +39 -7
- xinference/deploy/supervisor.py +2 -0
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/core.py +8 -1
- xinference/model/embedding/core.py +3 -2
- xinference/model/embedding/model_spec_modelscope.json +60 -18
- xinference/model/image/stable_diffusion/core.py +4 -3
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +87 -3
- xinference/model/llm/llm_family.py +15 -5
- xinference/model/llm/llm_family_modelscope.json +92 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/core.py +8 -1
- xinference/model/multimodal/model_spec.json +9 -0
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/multimodal/qwen_vl.py +5 -9
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
- xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
- xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +36 -0
- xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
- xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
- xinference/web/ui/node_modules/react-cookie/package.json +55 -0
- xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
- xinference/web/ui/package-lock.json +37 -0
- xinference/web/ui/package.json +3 -2
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
- xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py
CHANGED
@@ -24,13 +24,13 @@ from xoscar.utils import get_next_port

 from .. import __version__
 from ..client import RESTfulClient
-from ..client.oscar.actor_client import ActorClient
 from ..client.restful.restful_client import (
     RESTfulChatglmCppChatModelHandle,
     RESTfulChatModelHandle,
     RESTfulGenerateModelHandle,
 )
 from ..constants import (
+    XINFERENCE_AUTH_DIR,
     XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
     XINFERENCE_DEFAULT_ENDPOINT_PORT,
     XINFERENCE_DEFAULT_LOCAL_HOST,
@@ -62,10 +62,37 @@ def get_endpoint(endpoint: Optional[str]) -> str:
     return endpoint


+def get_hash_endpoint(endpoint: str) -> str:
+    import hashlib
+
+    m = hashlib.sha256()
+    m.update(bytes(endpoint, "utf-8"))
+    return m.hexdigest()
+
+
+def get_stored_token(
+    endpoint: str, client: Optional[RESTfulClient] = None
+) -> Optional[str]:
+    rest_client = RESTfulClient(endpoint) if client is None else client
+    authed = rest_client._cluster_authed
+    if not authed:
+        return None
+
+    token_path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
+    if not os.path.exists(token_path):
+        raise RuntimeError("Cannot find access token, please login first!")
+    with open(token_path, "r") as f:
+        access_token = str(f.read())
+    return access_token
+
+
 def start_local_cluster(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    auth_config_file: Optional[str] = None,
 ):
     from .local import main

@@ -80,7 +107,10 @@ def start_local_cluster(
     main(
         host=host,
         port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
+        auth_config_file=auth_config_file,
     )


@@ -159,12 +189,42 @@ def cli(
     type=int,
     help="Specify the port number for the Xinference server.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=None,
+    type=str,
+    help="Specify the host address for the Xinference metrics exporter server, default is the same as --host.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    "-mp",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter server.",
+)
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
 def local(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
+    auth_config: Optional[str],
 ):
-
+    if metrics_exporter_host is None:
+        metrics_exporter_host = host
+    start_local_cluster(
+        log_level=log_level,
+        host=host,
+        port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
+        auth_config_file=auth_config,
+    )


 @click.command(
@@ -196,7 +256,18 @@ def local(
     type=int,
     help="Specify the port number for the Xinference supervisor.",
 )
-
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
+def supervisor(
+    log_level: str,
+    host: str,
+    port: int,
+    supervisor_port: Optional[int],
+    auth_config: Optional[str],
+):
     from ..deploy.supervisor import main

     dict_config = get_config_dict(
@@ -208,7 +279,11 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     logging.config.dictConfig(dict_config)  # type: ignore

     main(
-        host=host,
+        host=host,
+        port=port,
+        supervisor_port=supervisor_port,
+        logging_conf=dict_config,
+        auth_config_file=auth_config,
     )


@@ -235,8 +310,25 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     type=int,
     help="Specify the port number for the Xinference worker.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
+    type=str,
+    help="Specify the host address for the metrics exporter server.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter worker.",
+)
 def worker(
-    log_level: str,
+    log_level: str,
+    endpoint: Optional[str],
+    host: str,
+    worker_port: Optional[int],
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     from ..deploy.worker import main

@@ -257,6 +349,8 @@ def worker(
     main(
         address=address,
         supervisor_address=supervisor_internal_addr,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
     )

@@ -288,6 +382,7 @@ def register_model(
         model = fd.read()

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,
@@ -316,6 +411,7 @@ def unregister_model(
     endpoint = get_endpoint(endpoint)

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,
@@ -343,8 +439,9 @@ def list_model_registrations(
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     registrations = client.list_model_registrations(model_type=model_type)

     table = []
@@ -518,8 +615,9 @@ def model_launch(
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     model_uid = client.launch_model(
         model_name=model_name,
         model_type=model_type,
@@ -550,6 +648,7 @@ def model_list(endpoint: Optional[str]):

     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []
@@ -626,8 +725,8 @@ def model_terminate(
     model_uid: str,
 ):
     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -657,6 +756,8 @@ def model_generate(
     stream: bool,
 ):
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -669,7 +770,7 @@ def model_generate(
             if prompt == "":
                 break
             print(f"Completion: {prompt}", end="", file=sys.stdout)
-
+            for chunk in model.generate(
                 prompt=prompt,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
             ):
@@ -680,7 +781,6 @@ def model_generate(
                     print(choice["text"], end="", flush=True, file=sys.stdout)
             print("", file=sys.stdout)

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -700,8 +800,7 @@ def model_generate(
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulGenerateModelHandle)
         ):
@@ -744,6 +843,9 @@ def model_chat(
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     chat_history: "List[ChatCompletionMessage]" = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
@@ -758,7 +860,7 @@ def model_chat(
                 break
             print("Assistant: ", end="", file=sys.stdout)
             response_content = ""
-
+            for chunk in model.chat(
                 prompt=prompt,
                 chat_history=chat_history,
                 generate_config={"stream": stream, "max_tokens": max_tokens},
@@ -775,7 +877,6 @@ def model_chat(
                 ChatCompletionMessage(role="assistant", content=response_content)
             )

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -795,8 +896,7 @@ def model_chat(
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle)
         ):
@@ -822,5 +922,31 @@ def model_chat(
     )


+@cli.command("login", help="Login when the cluster is authenticated.")
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+@click.option("--username", type=str, required=True, help="Username.")
+@click.option(
+    "--password",
+    type=str,
+    required=True,
+    help="Password.",
+)
+def cluster_login(
+    endpoint: Optional[str],
+    username: str,
+    password: str,
+):
+    endpoint = get_endpoint(endpoint)
+    restful_client = RESTfulClient(base_url=endpoint)
+    if restful_client._cluster_authed:
+        restful_client.login(username, password)
+        access_token = restful_client._get_token()
+        assert access_token is not None
+        os.makedirs(XINFERENCE_AUTH_DIR, exist_ok=True)
+        hashed_ep = get_hash_endpoint(endpoint)
+        with open(os.path.join(XINFERENCE_AUTH_DIR, hashed_ep), "w") as f:
+            f.write(access_token)
+
+
 if __name__ == "__main__":
     cli()
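The helpers added above (`get_hash_endpoint`, `get_stored_token`) and the new `login` command share one convention: the access token for a cluster is cached in a file named after the sha256 digest of the endpoint URL, under `XINFERENCE_AUTH_DIR`. A minimal sketch of that round trip, where the `~/.xinference/auth` path is an illustrative stand-in for the real constant from `xinference.constants`:

```python
# Sketch of the token cache behind `xinference login` and the CLI commands
# above. The auth dir path is an assumption for illustration only.
import hashlib
import os

XINFERENCE_AUTH_DIR = os.path.expanduser("~/.xinference/auth")  # assumed path


def get_hash_endpoint(endpoint: str) -> str:
    # Same hashing as the diff: sha256 over the UTF-8 endpoint string.
    return hashlib.sha256(endpoint.encode("utf-8")).hexdigest()


def store_token(endpoint: str, access_token: str) -> str:
    # What `cluster_login` does after a successful login.
    os.makedirs(XINFERENCE_AUTH_DIR, exist_ok=True)
    path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
    with open(path, "w") as f:
        f.write(access_token)
    return path


def load_token(endpoint: str) -> str:
    # What `get_stored_token` does for an authenticated cluster.
    path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
    if not os.path.exists(path):
        raise RuntimeError("Cannot find access token, please login first!")
    with open(path) as f:
        return f.read()


if __name__ == "__main__":
    p = store_token("http://127.0.0.1:9997", "dummy-token")
    assert load_token("http://127.0.0.1:9997") == "dummy-token"
    print(f"token cached at {p}")
```

Hashing the endpoint yields one token file per cluster and keeps URL characters out of file names, so logins against different endpoints never clobber each other.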
xinference/deploy/local.py
CHANGED
@@ -35,6 +35,8 @@ logger = logging.getLogger(__name__)

 async def _start_local_cluster(
     address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
 ):
     from .utils import create_worker_actor_pool
@@ -50,7 +52,11 @@ async def _start_local_cluster(
             SupervisorActor, address=address, uid=SupervisorActor.uid()
         )
         await start_worker_components(
-            address=address,
+            address=address,
+            supervisor_address=address,
+            main_pool=pool,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
         )
         await pool.join()
     except asyncio.CancelledError:
@@ -58,7 +64,12 @@ async def _start_local_cluster(
         await pool.stop()


-def run(
+def run(
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+):
     def sigterm_handler(signum, frame):
         sys.exit(0)

@@ -66,22 +77,42 @@ def run(address: str, logging_conf: Optional[Dict] = None):

     loop = asyncio.get_event_loop()
     task = loop.create_task(
-        _start_local_cluster(
+        _start_local_cluster(
+            address=address,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
+            logging_conf=logging_conf,
+        )
     )
     loop.run_until_complete(task)


 def run_in_subprocess(
-    address: str,
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
 ) -> multiprocessing.Process:
-    p = multiprocessing.Process(
+    p = multiprocessing.Process(
+        target=run,
+        args=(address, metrics_exporter_host, metrics_exporter_port, logging_conf),
+    )
     p.start()
     return p


-def main(
+def main(
+    host: str,
+    port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
+):
     supervisor_address = f"{host}:{get_next_port()}"
-    local_cluster = run_in_subprocess(
+    local_cluster = run_in_subprocess(
+        supervisor_address, metrics_exporter_host, metrics_exporter_port, logging_conf
+    )

     if not health_check(
         address=supervisor_address,
@@ -98,6 +129,7 @@ def main(host: str, port: int, logging_conf: Optional[Dict] = None):
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
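For context, a hedged sketch of how the reworked `deploy.local.main` entry point is called once the CLI has parsed the new flags; the host and port values below are illustrative, not defaults taken from the code:

```python
# Illustrative call into the updated local-deployment entry point; keyword
# names follow the new signature in the diff, values are made up.
from xinference.deploy.local import main

main(
    host="127.0.0.1",
    port=9997,
    metrics_exporter_host="127.0.0.1",  # new: where the metrics exporter binds
    metrics_exporter_port=9998,         # new: left as None if unused
    logging_conf=None,
    auth_config_file=None,              # new: auth config JSON path, if the cluster is authenticated
)
```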
xinference/deploy/supervisor.py
CHANGED
@@ -75,6 +75,7 @@ def main(
     port: int,
     supervisor_port: Optional[int],
     logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
 ):
     supervisor_address = f"{host}:{supervisor_port or get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
@@ -94,6 +95,7 @@ def main(
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
xinference/deploy/worker.py
CHANGED
@@ -27,7 +27,11 @@ logger = logging.getLogger(__name__)


 async def start_worker_components(
-    address: str,
+    address: str,
+    supervisor_address: str,
+    main_pool: MainActorPoolType,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     cuda_device_indices = []
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
@@ -43,24 +47,48 @@ async def start_worker_components(
         supervisor_address=supervisor_address,
         main_pool=main_pool,
         cuda_devices=cuda_device_indices,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )


 async def _start_worker(
-    address: str,
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Any = None,
 ):
     from .utils import create_worker_actor_pool

     pool = await create_worker_actor_pool(address=address, logging_conf=logging_conf)
     await start_worker_components(
-        address=address,
+        address=address,
+        supervisor_address=supervisor_address,
+        main_pool=pool,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )
     await pool.join()


-def main(
+def main(
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[dict] = None,
+):
     loop = asyncio.get_event_loop()
-    task = loop.create_task(
+    task = loop.create_task(
+        _start_worker(
+            address,
+            supervisor_address,
+            metrics_exporter_host,
+            metrics_exporter_port,
+            logging_conf,
+        )
+    )

     try:
         loop.run_until_complete(task)
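The reshaped `main` keeps the pattern the old code already used: build the `_start_worker` coroutine, wrap it in a task on the event loop, and block until it completes. A self-contained sketch of the same pattern, with `do_work` standing in for `_start_worker`:

```python
import asyncio


async def do_work(address: str, supervisor_address: str) -> None:
    # Stand-in for _start_worker: create the actor pool, start the worker
    # components, then wait on the pool.
    print(f"worker at {address}, reporting to {supervisor_address}")
    await asyncio.sleep(0)


def main(address: str, supervisor_address: str) -> None:
    # The diff uses asyncio.get_event_loop(); a fresh loop is used here to
    # keep the sketch warning-free on modern Python.
    loop = asyncio.new_event_loop()
    task = loop.create_task(do_work(address, supervisor_address))
    try:
        loop.run_until_complete(task)
    finally:
        loop.close()


main("127.0.0.1:30001", "127.0.0.1:9997")
```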
xinference/fields.py
CHANGED
@@ -30,7 +30,10 @@ logprobs_field = Field(
 )

 max_tokens_field = Field(
-    default=
+    default=1024,
+    ge=1,
+    le=32768,
+    description="The maximum number of tokens to generate.",
 )

 temperature_field = Field(
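With the completed definition, out-of-range values are rejected at request-validation time rather than deep in the model code. A small sketch of the bounds in action; `CompletionRequest` is a hypothetical model for illustration, not the actual xinference request type:

```python
from pydantic import BaseModel, Field, ValidationError

max_tokens_field = Field(
    default=1024,
    ge=1,
    le=32768,
    description="The maximum number of tokens to generate.",
)


class CompletionRequest(BaseModel):  # hypothetical request model
    max_tokens: int = max_tokens_field


print(CompletionRequest().max_tokens)                # 1024, the default
print(CompletionRequest(max_tokens=256).max_tokens)  # in range, accepted
try:
    CompletionRequest(max_tokens=0)                  # violates ge=1
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["loc"])
```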
xinference/model/core.py
CHANGED
@@ -78,7 +78,14 @@ def create_model_instance(
     elif model_type == "multimodal":
         kwargs.pop("trust_remote_code", None)
         return create_multimodal_model_instance(
-            subpool_addr,
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            model_format,
+            model_size_in_billions,
+            quantization,
+            **kwargs,
         )
     else:
         raise ValueError(f"Unsupported model type: {model_type}.")
xinference/model/embedding/core.py
CHANGED

@@ -40,7 +40,8 @@ class EmbeddingModelSpec(BaseModel):
     max_tokens: int
     language: List[str]
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
+    model_hub: str = "huggingface"


 class EmbeddingModelDescription(ModelDescription):
@@ -165,7 +166,7 @@ def cache(model_spec: EmbeddingModelSpec):
     if valid_model_revision(meta_path, model_spec.model_revision):
         return cache_dir

-    from_modelscope: bool = model_spec.
+    from_modelscope: bool = model_spec.model_hub == "modelscope"
     if from_modelscope:
         download_dir = retry_download(
             ms_download,