xinference 0.7.5__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (120)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/__init__.py +13 -0
  3. xinference/api/oauth2/common.py +14 -0
  4. xinference/api/oauth2/core.py +93 -0
  5. xinference/api/oauth2/types.py +36 -0
  6. xinference/api/oauth2/utils.py +44 -0
  7. xinference/api/restful_api.py +216 -27
  8. xinference/client/oscar/actor_client.py +18 -18
  9. xinference/client/restful/restful_client.py +96 -33
  10. xinference/conftest.py +63 -1
  11. xinference/constants.py +1 -0
  12. xinference/core/chat_interface.py +143 -3
  13. xinference/core/metrics.py +83 -0
  14. xinference/core/model.py +244 -181
  15. xinference/core/status_guard.py +86 -0
  16. xinference/core/supervisor.py +57 -7
  17. xinference/core/worker.py +134 -13
  18. xinference/deploy/cmdline.py +142 -16
  19. xinference/deploy/local.py +39 -7
  20. xinference/deploy/supervisor.py +2 -0
  21. xinference/deploy/worker.py +33 -5
  22. xinference/fields.py +4 -1
  23. xinference/model/core.py +8 -1
  24. xinference/model/embedding/core.py +3 -2
  25. xinference/model/embedding/model_spec_modelscope.json +60 -18
  26. xinference/model/image/stable_diffusion/core.py +4 -3
  27. xinference/model/llm/__init__.py +7 -0
  28. xinference/model/llm/ggml/llamacpp.py +3 -2
  29. xinference/model/llm/llm_family.json +87 -3
  30. xinference/model/llm/llm_family.py +15 -5
  31. xinference/model/llm/llm_family_modelscope.json +92 -3
  32. xinference/model/llm/pytorch/chatglm.py +70 -28
  33. xinference/model/llm/pytorch/core.py +11 -30
  34. xinference/model/llm/pytorch/internlm2.py +155 -0
  35. xinference/model/llm/pytorch/utils.py +0 -153
  36. xinference/model/llm/utils.py +37 -8
  37. xinference/model/llm/vllm/core.py +15 -3
  38. xinference/model/multimodal/__init__.py +15 -8
  39. xinference/model/multimodal/core.py +8 -1
  40. xinference/model/multimodal/model_spec.json +9 -0
  41. xinference/model/multimodal/model_spec_modelscope.json +45 -0
  42. xinference/model/multimodal/qwen_vl.py +5 -9
  43. xinference/model/utils.py +7 -2
  44. xinference/types.py +2 -0
  45. xinference/web/ui/build/asset-manifest.json +3 -3
  46. xinference/web/ui/build/index.html +1 -1
  47. xinference/web/ui/build/static/js/main.b83095c2.js +3 -0
  48. xinference/web/ui/build/static/js/{main.236e72e7.js.LICENSE.txt → main.b83095c2.js.LICENSE.txt} +7 -0
  49. xinference/web/ui/build/static/js/main.b83095c2.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0a853b2fa1902551e262a2f1a4b7894341f27b3dd9587f2ef7aaea195af89518.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/101923c539819f26ad11fbcbd6f6e56436b285efbb090dcc7dd648c6e924c4a8.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/193e7ba39e70d4bb2895a5cb317f6f293a5fd02e7e324c02a1eba2f83216419c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/22858de5265f2d279fca9f2f54dfb147e4b2704200dfb5d2ad3ec9769417328f.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/27696db5fcd4fcf0e7974cadf1e4a2ab89690474045c3188eafd586323ad13bb.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/27bdbe25deab8cf08f7fab8f05f8f26cf84a98809527a37986a4ab73a57ba96a.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/2bee7b8bd3d52976a45d6068e1333df88b943e0e679403c809e45382e3818037.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/30670751f55508ef3b861e13dd71b9e5a10d2561373357a12fc3831a0b77fd93.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/3605cd3a96ff2a3b443c70a101575482279ad26847924cab0684d165ba0d2492.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/3789ef437d3ecbf945bb9cea39093d1f16ebbfa32dbe6daf35abcfb6d48de6f1.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/4942da6bc03bf7373af068e22f916341aabc5b5df855d73c1d348c696724ce37.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/4d933e35e0fe79867d3aa6c46db28804804efddf5490347cb6c2c2879762a157.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/4d96f071168af43965e0fab2ded658fa0a15b8d9ca03789a5ef9c5c16a4e3cee.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/52a6136cb2dbbf9c51d461724d9b283ebe74a73fb19d5df7ba8e13c42bd7174d.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/5c408307c982f07f9c09c85c98212d1b1c22548a9194c69548750a3016b91b88.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/663adbcb60b942e9cf094c8d9fabe57517f5e5e6e722d28b4948a40b7445a3b8.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/666bb2e1b250dc731311a7e4880886177885dfa768508d2ed63e02630cc78725.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/71493aadd34d568fbe605cacaba220aa69bd09273251ee4ba27930f8d01fccd8.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/8b071db2a5a9ef68dc14d5f606540bd23d9785e365a11997c510656764d2dccf.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/8b246d79cd3f6fc78f11777e6a6acca6a2c5d4ecce7f2dd4dcf9a48126440d3c.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/8d33354bd2100c8602afc3341f131a88cc36aaeecd5a4b365ed038514708e350.json +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/a4d72d3b806ba061919115f0c513738726872e3c79cf258f007519d3f91d1a16.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/b4e4fccaf8f2489a29081f0bf3b191656bd452fb3c8b5e3c6d92d94f680964d5.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/b53eb7c7967f6577bd3e678293c44204fb03ffa7fdc1dd59d3099015c68f6f7f.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +1 -0
  79. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +1 -0
  80. xinference/web/ui/node_modules/.cache/babel-loader/d06af85a84e5c5a29d3acf2dbb5b30c0cf75c8aec4ab5f975e6096f944ee4324.json +1 -0
  81. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +1 -0
  82. xinference/web/ui/node_modules/.cache/babel-loader/d5e150bff31715977d8f537c970f06d4fe3de9909d7e8342244a83a9f6447121.json +1 -0
  83. xinference/web/ui/node_modules/.cache/babel-loader/de36e5c08fd524e341d664883dda6cb1745acc852a4f1b011a35a0b4615f72fa.json +1 -0
  84. xinference/web/ui/node_modules/.cache/babel-loader/f037ffef5992af0892d6d991053c1dace364cd39a3f11f1a41f92776e8a59459.json +1 -0
  85. xinference/web/ui/node_modules/.cache/babel-loader/f23ab356a8603d4a2aaa74388c2f381675c207d37c4d1c832df922e9655c9a6b.json +1 -0
  86. xinference/web/ui/node_modules/.cache/babel-loader/f7c23b0922f4087b9e2e3e46f15c946b772daa46c28c3a12426212ecaf481deb.json +1 -0
  87. xinference/web/ui/node_modules/.cache/babel-loader/f95a8bd358eeb55fa2f49f1224cc2f4f36006359856744ff09ae4bb295f59ec1.json +1 -0
  88. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +1 -0
  89. xinference/web/ui/node_modules/.package-lock.json +36 -0
  90. xinference/web/ui/node_modules/@types/cookie/package.json +30 -0
  91. xinference/web/ui/node_modules/@types/hoist-non-react-statics/package.json +33 -0
  92. xinference/web/ui/node_modules/react-cookie/package.json +55 -0
  93. xinference/web/ui/node_modules/universal-cookie/package.json +48 -0
  94. xinference/web/ui/package-lock.json +37 -0
  95. xinference/web/ui/package.json +3 -2
  96. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/METADATA +17 -6
  97. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/RECORD +101 -66
  98. xinference/web/ui/build/static/js/main.236e72e7.js +0 -3
  99. xinference/web/ui/build/static/js/main.236e72e7.js.map +0 -1
  100. xinference/web/ui/node_modules/.cache/babel-loader/0cccfbe5d963b8e31eb679f9d9677392839cedd04aa2956ac6b33cf19599d597.json +0 -1
  101. xinference/web/ui/node_modules/.cache/babel-loader/0f3b6cc71b7c83bdc85aa4835927aeb86af2ce0d2ac241917ecfbf90f75c6d27.json +0 -1
  102. xinference/web/ui/node_modules/.cache/babel-loader/2f651cf60b1bde50c0601c7110f77dd44819fb6e2501ff748a631724d91445d4.json +0 -1
  103. xinference/web/ui/node_modules/.cache/babel-loader/42bb623f337ad08ed076484185726e072ca52bb88e373d72c7b052db4c273342.json +0 -1
  104. xinference/web/ui/node_modules/.cache/babel-loader/57af83639c604bd3362d0f03f7505e81c6f67ff77bee7c6bb31f6e5523eba185.json +0 -1
  105. xinference/web/ui/node_modules/.cache/babel-loader/667753ce39ce1d4bcbf9a5f1a103d653be1d19d42f4e1fbaceb9b507679a52c7.json +0 -1
  106. xinference/web/ui/node_modules/.cache/babel-loader/66ed1bd4c06748c1b176a625c25c856997edc787856c73162f82f2b465c5d956.json +0 -1
  107. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +0 -1
  108. xinference/web/ui/node_modules/.cache/babel-loader/8d2b0b3c6988d1894694dcbbe708ef91cfe62d62dac317031f09915ced637953.json +0 -1
  109. xinference/web/ui/node_modules/.cache/babel-loader/9427ae7f1e94ae8dcd2333fb361e381f4054fde07394fe5448658e3417368476.json +0 -1
  110. xinference/web/ui/node_modules/.cache/babel-loader/bcee2b4e76b07620f9087989eb86d43c645ba3c7a74132cf926260af1164af0e.json +0 -1
  111. xinference/web/ui/node_modules/.cache/babel-loader/cc2ddd02ccc1dad1a2737ac247c79e6f6ed2c7836c6b68e511e3048f666b64af.json +0 -1
  112. xinference/web/ui/node_modules/.cache/babel-loader/d2e8e6665a7efc832b43907dadf4e3c896a59eaf8129f9a520882466c8f2e489.json +0 -1
  113. xinference/web/ui/node_modules/.cache/babel-loader/d8a42e9df7157de9f28eecefdf178fd113bf2280d28471b6e32a8a45276042df.json +0 -1
  114. xinference/web/ui/node_modules/.cache/babel-loader/e26750d9556e9741912333349e4da454c53dbfddbfc6002ab49518dcf02af745.json +0 -1
  115. xinference/web/ui/node_modules/.cache/babel-loader/ef42ec014d7bc373b874b2a1ff0dcd785490f125e913698bc049b0bd778e4d66.json +0 -1
  116. xinference/web/ui/node_modules/.cache/babel-loader/fe3eb4d76c79ca98833f686d642224eeeb94cc83ad14300d281623796d087f0a.json +0 -1
  117. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
  118. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
  119. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
  120. {xinference-0.7.5.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/deploy/cmdline.py CHANGED
@@ -24,13 +24,13 @@ from xoscar.utils import get_next_port

 from .. import __version__
 from ..client import RESTfulClient
-from ..client.oscar.actor_client import ActorClient
 from ..client.restful.restful_client import (
     RESTfulChatglmCppChatModelHandle,
     RESTfulChatModelHandle,
     RESTfulGenerateModelHandle,
 )
 from ..constants import (
+    XINFERENCE_AUTH_DIR,
     XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
     XINFERENCE_DEFAULT_ENDPOINT_PORT,
     XINFERENCE_DEFAULT_LOCAL_HOST,
@@ -62,10 +62,37 @@ def get_endpoint(endpoint: Optional[str]) -> str:
     return endpoint


+def get_hash_endpoint(endpoint: str) -> str:
+    import hashlib
+
+    m = hashlib.sha256()
+    m.update(bytes(endpoint, "utf-8"))
+    return m.hexdigest()
+
+
+def get_stored_token(
+    endpoint: str, client: Optional[RESTfulClient] = None
+) -> Optional[str]:
+    rest_client = RESTfulClient(endpoint) if client is None else client
+    authed = rest_client._cluster_authed
+    if not authed:
+        return None
+
+    token_path = os.path.join(XINFERENCE_AUTH_DIR, get_hash_endpoint(endpoint))
+    if not os.path.exists(token_path):
+        raise RuntimeError("Cannot find access token, please login first!")
+    with open(token_path, "r") as f:
+        access_token = str(f.read())
+    return access_token
+
+
 def start_local_cluster(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    auth_config_file: Optional[str] = None,
 ):
     from .local import main

@@ -80,7 +107,10 @@ def start_local_cluster(
     main(
         host=host,
         port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
+        auth_config_file=auth_config_file,
     )


@@ -159,12 +189,42 @@ def cli(
     type=int,
     help="Specify the port number for the Xinference server.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=None,
+    type=str,
+    help="Specify the host address for the Xinference metrics exporter server, default is the same as --host.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    "-mp",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter server.",
+)
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
 def local(
     log_level: str,
     host: str,
     port: int,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
+    auth_config: Optional[str],
 ):
-    start_local_cluster(log_level=log_level, host=host, port=port)
+    if metrics_exporter_host is None:
+        metrics_exporter_host = host
+    start_local_cluster(
+        log_level=log_level,
+        host=host,
+        port=port,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
+        auth_config_file=auth_config,
+    )


 @click.command(
@@ -196,7 +256,18 @@ def local(
     type=int,
     help="Specify the port number for the Xinference supervisor.",
 )
-def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[int]):
+@click.option(
+    "--auth-config",
+    type=str,
+    help="Specify the auth config json file.",
+)
+def supervisor(
+    log_level: str,
+    host: str,
+    port: int,
+    supervisor_port: Optional[int],
+    auth_config: Optional[str],
+):
     from ..deploy.supervisor import main

     dict_config = get_config_dict(
@@ -208,7 +279,11 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     logging.config.dictConfig(dict_config)  # type: ignore

     main(
-        host=host, port=port, supervisor_port=supervisor_port, logging_conf=dict_config
+        host=host,
+        port=port,
+        supervisor_port=supervisor_port,
+        logging_conf=dict_config,
+        auth_config_file=auth_config,
     )


@@ -235,8 +310,25 @@ def supervisor(log_level: str, host: str, port: int, supervisor_port: Optional[i
     type=int,
     help="Specify the port number for the Xinference worker.",
 )
+@click.option(
+    "--metrics-exporter-host",
+    "-MH",
+    default=XINFERENCE_DEFAULT_DISTRIBUTED_HOST,
+    type=str,
+    help="Specify the host address for the metrics exporter server.",
+)
+@click.option(
+    "--metrics-exporter-port",
+    type=int,
+    help="Specify the port number for the Xinference metrics exporter worker.",
+)
 def worker(
-    log_level: str, endpoint: Optional[str], host: str, worker_port: Optional[int]
+    log_level: str,
+    endpoint: Optional[str],
+    host: str,
+    worker_port: Optional[int],
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     from ..deploy.worker import main

@@ -257,6 +349,8 @@ def worker(
     main(
         address=address,
         supervisor_address=supervisor_internal_addr,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
         logging_conf=dict_config,
     )

@@ -288,6 +382,7 @@ def register_model(
         model = fd.read()

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,
@@ -316,6 +411,7 @@ def unregister_model(
     endpoint = get_endpoint(endpoint)

     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,
@@ -343,8 +439,9 @@ def list_model_registrations(
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     registrations = client.list_model_registrations(model_type=model_type)

     table = []
@@ -518,8 +615,9 @@ def model_launch(
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     model_uid = client.launch_model(
         model_name=model_name,
         model_type=model_type,
@@ -550,6 +648,7 @@ def model_list(endpoint: Optional[str]):

     endpoint = get_endpoint(endpoint)
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []
@@ -626,8 +725,8 @@ def model_terminate(
     model_uid: str,
 ):
     endpoint = get_endpoint(endpoint)
-
     client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -657,6 +756,8 @@ def model_generate(
     stream: bool,
 ):
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -669,7 +770,7 @@
                 if prompt == "":
                     break
                 print(f"Completion: {prompt}", end="", file=sys.stdout)
-                async for chunk in model.generate(
+                for chunk in model.generate(
                     prompt=prompt,
                     generate_config={"stream": stream, "max_tokens": max_tokens},
                 ):
@@ -680,7 +781,6 @@
                         print(choice["text"], end="", flush=True, file=sys.stdout)
                 print("", file=sys.stdout)

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -700,8 +800,7 @@
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-        restful_client = RESTfulClient(base_url=endpoint)
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulGenerateModelHandle)
         ):
@@ -744,6 +843,9 @@ def model_chat(
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+
     chat_history: "List[ChatCompletionMessage]" = []
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
@@ -758,7 +860,7 @@
                     break
                 print("Assistant: ", end="", file=sys.stdout)
                 response_content = ""
-                async for chunk in model.chat(
+                for chunk in model.chat(
                     prompt=prompt,
                     chat_history=chat_history,
                     generate_config={"stream": stream, "max_tokens": max_tokens},
@@ -775,7 +877,6 @@
                     ChatCompletionMessage(role="assistant", content=response_content)
                 )

-        client = ActorClient(endpoint=endpoint)
         model = client.get_model(model_uid=model_uid)

         loop = asyncio.get_event_loop()
@@ -795,8 +896,7 @@
             # avoid displaying exception-unhandled warnings
             task.exception()
     else:
-        restful_client = RESTfulClient(base_url=endpoint)
-        restful_model = restful_client.get_model(model_uid=model_uid)
+        restful_model = client.get_model(model_uid=model_uid)
         if not isinstance(
             restful_model, (RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle)
         ):
@@ -822,5 +922,31 @@ def model_chat(
     )


+@cli.command("login", help="Login when the cluster is authenticated.")
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+@click.option("--username", type=str, required=True, help="Username.")
+@click.option(
+    "--password",
+    type=str,
+    required=True,
+    help="Password.",
+)
+def cluster_login(
+    endpoint: Optional[str],
+    username: str,
+    password: str,
+):
+    endpoint = get_endpoint(endpoint)
+    restful_client = RESTfulClient(base_url=endpoint)
+    if restful_client._cluster_authed:
+        restful_client.login(username, password)
+        access_token = restful_client._get_token()
+        assert access_token is not None
+        os.makedirs(XINFERENCE_AUTH_DIR, exist_ok=True)
+        hashed_ep = get_hash_endpoint(endpoint)
+        with open(os.path.join(XINFERENCE_AUTH_DIR, hashed_ep), "w") as f:
+            f.write(access_token)
+
+
 if __name__ == "__main__":
     cli()
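Taken together, the new `login` command and `get_stored_token` implement a small per-endpoint token cache: the endpoint URL is hashed with SHA-256, the access token is written to a file of that name under `XINFERENCE_AUTH_DIR`, and every other CLI command reads it back and attaches it with `client._set_token(...)`. A minimal standalone sketch of that mechanism (`AUTH_DIR` here is a hypothetical stand-in for the real `XINFERENCE_AUTH_DIR` constant):

import hashlib
import os
from typing import Optional

# Hypothetical stand-in for xinference.constants.XINFERENCE_AUTH_DIR.
AUTH_DIR = os.path.expanduser("~/.xinference/auth")


def hash_endpoint(endpoint: str) -> str:
    # One token file per endpoint, keyed by the SHA-256 hex digest of its URL.
    return hashlib.sha256(endpoint.encode("utf-8")).hexdigest()


def store_token(endpoint: str, access_token: str) -> None:
    # Roughly what `xinference login` does after a successful authentication.
    os.makedirs(AUTH_DIR, exist_ok=True)
    with open(os.path.join(AUTH_DIR, hash_endpoint(endpoint)), "w") as f:
        f.write(access_token)


def load_token(endpoint: str) -> Optional[str]:
    # Roughly what get_stored_token does once the cluster reports it is authed.
    token_path = os.path.join(AUTH_DIR, hash_endpoint(endpoint))
    if not os.path.exists(token_path):
        raise RuntimeError("Cannot find access token, please login first!")
    with open(token_path, "r") as f:
        return f.read()


if __name__ == "__main__":
    store_token("http://127.0.0.1:9997", "example-token")
    print(load_token("http://127.0.0.1:9997"))

Hashing the endpoint keeps the cache directory flat and filesystem-safe regardless of which characters the URL contains.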
xinference/deploy/local.py CHANGED
@@ -35,6 +35,8 @@ logger = logging.getLogger(__name__)

 async def _start_local_cluster(
     address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
     logging_conf: Optional[Dict] = None,
 ):
     from .utils import create_worker_actor_pool
@@ -50,7 +52,11 @@ async def _start_local_cluster(
             SupervisorActor, address=address, uid=SupervisorActor.uid()
         )
         await start_worker_components(
-            address=address, supervisor_address=address, main_pool=pool
+            address=address,
+            supervisor_address=address,
+            main_pool=pool,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
         )
         await pool.join()
     except asyncio.CancelledError:
@@ -58,7 +64,12 @@ async def _start_local_cluster(
         await pool.stop()


-def run(address: str, logging_conf: Optional[Dict] = None):
+def run(
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+):
     def sigterm_handler(signum, frame):
         sys.exit(0)

@@ -66,22 +77,42 @@ def run(address: str, logging_conf: Optional[Dict] = None):

     loop = asyncio.get_event_loop()
     task = loop.create_task(
-        _start_local_cluster(address=address, logging_conf=logging_conf)
+        _start_local_cluster(
+            address=address,
+            metrics_exporter_host=metrics_exporter_host,
+            metrics_exporter_port=metrics_exporter_port,
+            logging_conf=logging_conf,
+        )
     )
     loop.run_until_complete(task)


 def run_in_subprocess(
-    address: str, logging_conf: Optional[Dict] = None
+    address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
 ) -> multiprocessing.Process:
-    p = multiprocessing.Process(target=run, args=(address, logging_conf))
+    p = multiprocessing.Process(
+        target=run,
+        args=(address, metrics_exporter_host, metrics_exporter_port, logging_conf),
+    )
     p.start()
     return p


-def main(host: str, port: int, logging_conf: Optional[Dict] = None):
+def main(
+    host: str,
+    port: int,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
+):
     supervisor_address = f"{host}:{get_next_port()}"
-    local_cluster = run_in_subprocess(supervisor_address, logging_conf)
+    local_cluster = run_in_subprocess(
+        supervisor_address, metrics_exporter_host, metrics_exporter_port, logging_conf
+    )

     if not health_check(
         address=supervisor_address,
@@ -98,6 +129,7 @@ def main(host: str, port: int, logging_conf: Optional[Dict] = None):
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
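One detail worth noting above: `run_in_subprocess` forwards the new settings to the child process positionally via `multiprocessing.Process(args=...)`, so the tuple order has to mirror `run`'s parameter order exactly. A reduced sketch of the pattern, with a print standing in for the real actor-pool startup:

import multiprocessing
from typing import Dict, Optional


def run(
    address: str,
    metrics_exporter_host: Optional[str] = None,
    metrics_exporter_port: Optional[int] = None,
    logging_conf: Optional[Dict] = None,
) -> None:
    # Simplified stand-in for local.run, which starts the actor pool.
    print(
        f"cluster at {address}, "
        f"metrics on {metrics_exporter_host}:{metrics_exporter_port}"
    )


def run_in_subprocess(
    address: str,
    metrics_exporter_host: Optional[str] = None,
    metrics_exporter_port: Optional[int] = None,
    logging_conf: Optional[Dict] = None,
) -> multiprocessing.Process:
    # args is positional: it must stay in the same order as run()'s parameters.
    p = multiprocessing.Process(
        target=run,
        args=(address, metrics_exporter_host, metrics_exporter_port, logging_conf),
    )
    p.start()
    return p


if __name__ == "__main__":
    proc = run_in_subprocess("127.0.0.1:9999", "0.0.0.0", 9998)
    proc.join()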
xinference/deploy/supervisor.py CHANGED
@@ -75,6 +75,7 @@ def main(
     port: int,
     supervisor_port: Optional[int],
     logging_conf: Optional[Dict] = None,
+    auth_config_file: Optional[str] = None,
 ):
     supervisor_address = f"{host}:{supervisor_port or get_next_port()}"
     local_cluster = run_in_subprocess(supervisor_address, logging_conf)
@@ -94,6 +95,7 @@ def main(
             host=host,
             port=port,
             logging_conf=logging_conf,
+            auth_config_file=auth_config_file,
         )
     finally:
         local_cluster.terminate()
xinference/deploy/worker.py CHANGED
@@ -27,7 +27,11 @@ logger = logging.getLogger(__name__)


 async def start_worker_components(
-    address: str, supervisor_address: str, main_pool: MainActorPoolType
+    address: str,
+    supervisor_address: str,
+    main_pool: MainActorPoolType,
+    metrics_exporter_host: Optional[str],
+    metrics_exporter_port: Optional[int],
 ):
     cuda_device_indices = []
     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
@@ -43,24 +47,48 @@ async def start_worker_components(
         supervisor_address=supervisor_address,
         main_pool=main_pool,
         cuda_devices=cuda_device_indices,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )


 async def _start_worker(
-    address: str, supervisor_address: str, logging_conf: Any = None
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Any = None,
 ):
     from .utils import create_worker_actor_pool

     pool = await create_worker_actor_pool(address=address, logging_conf=logging_conf)
     await start_worker_components(
-        address=address, supervisor_address=supervisor_address, main_pool=pool
+        address=address,
+        supervisor_address=supervisor_address,
+        main_pool=pool,
+        metrics_exporter_host=metrics_exporter_host,
+        metrics_exporter_port=metrics_exporter_port,
     )
     await pool.join()


-def main(address: str, supervisor_address: str, logging_conf: Optional[dict] = None):
+def main(
+    address: str,
+    supervisor_address: str,
+    metrics_exporter_host: Optional[str] = None,
+    metrics_exporter_port: Optional[int] = None,
+    logging_conf: Optional[dict] = None,
+):
     loop = asyncio.get_event_loop()
-    task = loop.create_task(_start_worker(address, supervisor_address, logging_conf))
+    task = loop.create_task(
+        _start_worker(
+            address,
+            supervisor_address,
+            metrics_exporter_host,
+            metrics_exporter_port,
+            logging_conf,
+        )
+    )

     try:
         loop.run_until_complete(task)
xinference/fields.py CHANGED
@@ -30,7 +30,10 @@ logprobs_field = Field(
 )

 max_tokens_field = Field(
-    default=128, ge=1, le=32768, description="The maximum number of tokens to generate."
+    default=1024,
+    ge=1,
+    le=32768,
+    description="The maximum number of tokens to generate.",
 )

 temperature_field = Field(
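Because `max_tokens_field` is a shared pydantic `Field`, raising the default from 128 to 1024 takes effect in every request model that reuses it, while the `ge`/`le` bounds continue to be validated. A small illustrative sketch (`GenerateConfig` here is a hypothetical model, not the real xinference type):

from pydantic import BaseModel, Field, ValidationError

max_tokens_field = Field(
    default=1024,
    ge=1,
    le=32768,
    description="The maximum number of tokens to generate.",
)


class GenerateConfig(BaseModel):
    # Hypothetical request model reusing the shared field definition.
    max_tokens: int = max_tokens_field


print(GenerateConfig().max_tokens)  # 1024: the new default applies

try:
    GenerateConfig(max_tokens=0)  # rejected, violates ge=1
except ValidationError as exc:
    print(exc)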
xinference/model/core.py CHANGED
@@ -78,7 +78,14 @@ def create_model_instance(
     elif model_type == "multimodal":
         kwargs.pop("trust_remote_code", None)
         return create_multimodal_model_instance(
-            subpool_addr, devices, model_uid, model_name, **kwargs
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            model_format,
+            model_size_in_billions,
+            quantization,
+            **kwargs,
         )
     else:
         raise ValueError(f"Unsupported model type: {model_type}.")
xinference/model/embedding/core.py CHANGED
@@ -40,7 +40,8 @@ class EmbeddingModelSpec(BaseModel):
     max_tokens: int
     language: List[str]
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
+    model_hub: str = "huggingface"


 class EmbeddingModelDescription(ModelDescription):
@@ -165,7 +166,7 @@ def cache(model_spec: EmbeddingModelSpec):
     if valid_model_revision(meta_path, model_spec.model_revision):
         return cache_dir

-    from_modelscope: bool = model_spec.model_id.startswith("Xorbits/")
+    from_modelscope: bool = model_spec.model_hub == "modelscope"
     if from_modelscope:
         download_dir = retry_download(
             ms_download,
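With the hub recorded explicitly on the spec (`model_hub`, defaulting to `"huggingface"`), `cache` no longer has to infer ModelScope models from a `Xorbits/` prefix in the model ID. A reduced sketch of the spec and the dispatch, with strings standing in for the real download calls:

from dataclasses import dataclass
from typing import Optional


@dataclass
class EmbeddingModelSpec:
    # Cut-down version of the real (pydantic) spec; model_hub is the new field.
    model_id: str
    model_revision: Optional[str] = None
    model_hub: str = "huggingface"


def cache(spec: EmbeddingModelSpec) -> str:
    # Dispatch on the explicit hub field instead of the old
    # spec.model_id.startswith("Xorbits/") heuristic.
    if spec.model_hub == "modelscope":
        return f"download {spec.model_id} from ModelScope"
    return f"download {spec.model_id} from Hugging Face"


print(cache(EmbeddingModelSpec(model_id="Xorbits/bge-small-zh", model_hub="modelscope")))
print(cache(EmbeddingModelSpec(model_id="BAAI/bge-small-en")))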