xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (103)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +47 -18
  3. xinference/api/oauth2/types.py +1 -0
  4. xinference/api/restful_api.py +34 -7
  5. xinference/client/oscar/actor_client.py +4 -3
  6. xinference/client/restful/restful_client.py +20 -4
  7. xinference/conftest.py +13 -2
  8. xinference/core/supervisor.py +48 -1
  9. xinference/core/worker.py +139 -20
  10. xinference/deploy/cmdline.py +119 -20
  11. xinference/model/embedding/core.py +1 -2
  12. xinference/model/llm/__init__.py +4 -6
  13. xinference/model/llm/ggml/llamacpp.py +2 -10
  14. xinference/model/llm/llm_family.json +877 -13
  15. xinference/model/llm/llm_family.py +15 -0
  16. xinference/model/llm/llm_family_modelscope.json +571 -0
  17. xinference/model/llm/pytorch/chatglm.py +2 -0
  18. xinference/model/llm/pytorch/core.py +22 -26
  19. xinference/model/llm/pytorch/deepseek_vl.py +232 -0
  20. xinference/model/llm/pytorch/internlm2.py +2 -0
  21. xinference/model/llm/pytorch/omnilmm.py +153 -0
  22. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  23. xinference/model/llm/pytorch/yi_vl.py +4 -2
  24. xinference/model/llm/utils.py +53 -5
  25. xinference/model/llm/vllm/core.py +54 -6
  26. xinference/model/rerank/core.py +3 -0
  27. xinference/thirdparty/deepseek_vl/__init__.py +31 -0
  28. xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
  29. xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
  30. xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
  31. xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
  32. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
  33. xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
  34. xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
  35. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
  36. xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
  37. xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
  38. xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
  39. xinference/thirdparty/omnilmm/__init__.py +0 -0
  40. xinference/thirdparty/omnilmm/chat.py +216 -0
  41. xinference/thirdparty/omnilmm/constants.py +4 -0
  42. xinference/thirdparty/omnilmm/conversation.py +332 -0
  43. xinference/thirdparty/omnilmm/model/__init__.py +1 -0
  44. xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
  45. xinference/thirdparty/omnilmm/model/resampler.py +166 -0
  46. xinference/thirdparty/omnilmm/model/utils.py +563 -0
  47. xinference/thirdparty/omnilmm/train/__init__.py +13 -0
  48. xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
  49. xinference/thirdparty/omnilmm/utils.py +134 -0
  50. xinference/types.py +15 -19
  51. xinference/web/ui/build/asset-manifest.json +3 -3
  52. xinference/web/ui/build/index.html +1 -1
  53. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  54. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  68. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  70. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  71. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  72. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  73. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
  74. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
  75. xinference/model/llm/ggml/ctransformers.py +0 -281
  76. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  77. xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
  78. xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
  98. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  99. /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  100. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  101. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  102. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  103. {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/core/worker.py CHANGED
@@ -74,6 +74,10 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
+        # Dict structure: gpu_index: {(replica_model_uid, model_type)}
+        self._user_specified_gpu_to_model_uids: Dict[
+            int, Set[Tuple[str, str]]
+        ] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, int] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
@@ -268,12 +272,27 @@ class WorkerActor(xo.StatelessActor):
         """
         candidates = []
         for _dev in self._total_gpu_devices:
-            if _dev not in self._gpu_to_model_uid:
+            if (
+                _dev not in self._gpu_to_model_uid
+                and _dev not in self._user_specified_gpu_to_model_uids
+            ):  # no possible vllm model on it, add it to candidates
                 candidates.append(_dev)
-            else:
-                existing_model_uid = self._gpu_to_model_uid[_dev]
-                is_vllm_model = await self.is_model_vllm_backend(existing_model_uid)
-                if not is_vllm_model:
+            else:  # need to judge that whether to have vllm model on this device
+                has_vllm_model = False
+                if _dev in self._gpu_to_model_uid:
+                    existing_model_uid = self._gpu_to_model_uid[_dev]
+                    has_vllm_model = await self.is_model_vllm_backend(
+                        existing_model_uid
+                    )
+                if (
+                    not has_vllm_model
+                    and _dev in self._user_specified_gpu_to_model_uids
+                ):
+                    for rep_uid, _ in self._user_specified_gpu_to_model_uids[_dev]:
+                        has_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                        if has_vllm_model:
+                            break
+                if not has_vllm_model:
                     candidates.append(_dev)

         if len(candidates) == 0:
@@ -285,9 +304,13 @@ class WorkerActor(xo.StatelessActor):
         device, min_cnt = -1, -1
         # Pick the device with the fewest existing models among all the candidate devices.
         for _dev in candidates:
-            existing_cnt = len(self._gpu_to_embedding_model_uids[_dev])
+            existing_cnt = 0
+            if _dev in self._gpu_to_embedding_model_uids:
+                existing_cnt += len(self._gpu_to_embedding_model_uids[_dev])
             if _dev in self._gpu_to_model_uid:
                 existing_cnt += 1
+            if _dev in self._user_specified_gpu_to_model_uids:
+                existing_cnt += len(self._user_specified_gpu_to_model_uids[_dev])
             if min_cnt == -1 or existing_cnt < min_cnt:
                 device, min_cnt = _dev, existing_cnt

@@ -295,17 +318,82 @@ class WorkerActor(xo.StatelessActor):
         return device

     def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
-        if n_gpu > len(self._total_gpu_devices) - len(self._gpu_to_model_uid):
+        user_specified_allocated_devices: Set[int] = set()
+        for dev, model_infos in self._user_specified_gpu_to_model_uids.items():
+            allocated_non_embedding_rerank_models = False
+            for _, model_type in model_infos:
+                allocated_non_embedding_rerank_models = model_type not in [
+                    "embedding",
+                    "rerank",
+                ]
+                if allocated_non_embedding_rerank_models:
+                    break
+            if allocated_non_embedding_rerank_models:
+                user_specified_allocated_devices.add(dev)
+        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
+            user_specified_allocated_devices
+        )
+        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
             raise RuntimeError("No available slot found for the model")

         devices: List[int] = [
-            dev for dev in self._total_gpu_devices if dev not in self._gpu_to_model_uid
+            dev
+            for dev in self._total_gpu_devices
+            if dev not in self._gpu_to_model_uid
+            and dev not in user_specified_allocated_devices
         ][:n_gpu]
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid

         return sorted(devices)

+    async def allocate_devices_with_gpu_idx(
+        self, model_uid: str, model_type: str, gpu_idx: List[int]
+    ) -> List[int]:
+        """
+        When user specifies the gpu_idx, allocate models on user-specified GPUs whenever possible
+        """
+        # must be subset of total devices visible to this worker
+        if not set(gpu_idx) <= set(self._total_gpu_devices):
+            raise ValueError(
+                f"Worker {self.address} cannot use the GPUs with these indexes: {gpu_idx}. "
+                f"Worker {self.address} can only see these GPUs: {self._total_gpu_devices}."
+            )
+        # currently just report a warning log when there are already models on these GPUs
+        for idx in gpu_idx:
+            existing_model_uids = []
+            if idx in self._gpu_to_model_uid:
+                rep_uid = self._gpu_to_model_uid[idx]
+                is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                if is_vllm_model:
+                    raise RuntimeError(
+                        f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                        f"therefore cannot allocate GPU memory for a new model."
+                    )
+                existing_model_uids.append(rep_uid)
+            if idx in self._gpu_to_embedding_model_uids:
+                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
+            # If user has run the vLLM model on the GPU that was forced to be specified,
+            # it is not possible to force this GPU to be allocated again
+            if idx in self._user_specified_gpu_to_model_uids:
+                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
+                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                    if is_vllm_model:
+                        raise RuntimeError(
+                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                            f"therefore cannot allocate GPU memory for a new model."
+                        )
+
+            if existing_model_uids:
+                logger.warning(
+                    f"WARNING!!! GPU index {idx} has been occupied "
+                    f"with these models on it: {existing_model_uids}"
+                )
+
+        for idx in gpu_idx:
+            self._user_specified_gpu_to_model_uids[idx].add((model_uid, model_type))
+        return sorted(gpu_idx)
+
     def release_devices(self, model_uid: str):
         devices = [
             dev
@@ -320,27 +408,46 @@ class WorkerActor(xo.StatelessActor):
             if model_uid in self._gpu_to_embedding_model_uids[dev]:
                 self._gpu_to_embedding_model_uids[dev].remove(model_uid)

+        # check user-specified slots
+        for dev in self._user_specified_gpu_to_model_uids:
+            model_infos = list(
+                filter(
+                    lambda x: x[0] == model_uid,
+                    self._user_specified_gpu_to_model_uids[dev],
+                )
+            )
+            for model_info in model_infos:
+                self._user_specified_gpu_to_model_uids[dev].remove(model_info)
+
     async def _create_subpool(
         self,
         model_uid: str,
         model_type: Optional[str] = None,
         n_gpu: Optional[Union[int, str]] = "auto",
+        gpu_idx: Optional[List[int]] = None,
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
-        if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
-            # Currently, n_gpu=auto means using 1 GPU
-            gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
-            devices = (
-                [await self.allocate_devices_for_embedding(model_uid)]
-                if model_type in ["embedding", "rerank"]
-                else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
+        if gpu_idx is None:
+            if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
+                # Currently, n_gpu=auto means using 1 GPU
+                gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
+                devices = (
+                    [await self.allocate_devices_for_embedding(model_uid)]
+                    if model_type in ["embedding", "rerank"]
+                    else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
+                )
+                env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+                logger.debug(f"GPU selected: {devices} for model {model_uid}")
+            if n_gpu is None:
+                env["CUDA_VISIBLE_DEVICES"] = "-1"
+                logger.debug(f"GPU disabled for model {model_uid}")
+        else:
+            assert isinstance(gpu_idx, list)
+            devices = await self.allocate_devices_with_gpu_idx(
+                model_uid, model_type, gpu_idx  # type: ignore
             )
             env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
-            logger.debug(f"GPU selected: {devices} for model {model_uid}")
-        if n_gpu is None:
-            env["CUDA_VISIBLE_DEVICES"] = "-1"
-            logger.debug(f"GPU disabled for model {model_uid}")

         if os.name != "nt" and platform.system() != "Darwin":
             # Linux
@@ -495,6 +602,7 @@ class WorkerActor(xo.StatelessActor):
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
@@ -510,6 +618,17 @@ class WorkerActor(xo.StatelessActor):
         launch_args.pop("self")
         launch_args.pop("kwargs")
         launch_args.update(kwargs)
+
+        if gpu_idx is not None:
+            logger.info(
+                f"You specify to launch the model: {model_name} on GPU index: {gpu_idx} "
+                f"of the worker: {self.address}, "
+                f"xinference will automatically ignore the `n_gpu` option."
+            )
+            if isinstance(gpu_idx, int):
+                gpu_idx = [gpu_idx]
+            assert isinstance(gpu_idx, list)
+
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
                 raise ValueError(
@@ -535,7 +654,7 @@ class WorkerActor(xo.StatelessActor):
         is_local_deployment = await self._supervisor_ref.is_local_deployment()

         subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu
+            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
        )

         try:
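The net effect of the worker changes above: when a launch request carries gpu_idx, the worker records the placement in _user_specified_gpu_to_model_uids, bypasses the automatic n_gpu allocation, and refuses the request only if a vLLM model already occupies one of the requested GPUs (otherwise it just logs a warning). A minimal client-side sketch of how this path is reached is shown below; the endpoint, model name, size and GPU indexes are placeholders, and the keyword names mirror the client.launch_model(...) call in the cmdline.py diff that follows.

    # Hedged sketch: values are placeholders; worker_ip / gpu_idx are the new
    # launch arguments wired through in this release range.
    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model_uid = client.launch_model(
        model_name="llama-2-chat",      # placeholder model
        model_size_in_billions=7,
        model_format="pytorch",
        worker_ip="192.168.0.12",       # choose a specific worker in a cluster
        gpu_idx=[0, 1],                 # pin the model to these GPU indexes
    )
    print(model_uid)
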
xinference/deploy/cmdline.py CHANGED
@@ -376,18 +376,27 @@ def worker(
     is_flag=True,
     help="Persist the model configuration to the filesystem, retains the model registration after server restarts.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def register_model(
     endpoint: Optional[str],
     model_type: str,
     file: str,
     persist: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
     with open(file) as fd:
         model = fd.read()

-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,
@@ -408,15 +417,24 @@ def register_model(
     help="Type of model to unregister (default is 'LLM').",
 )
 @click.option("--model-name", "-n", type=str, help="Name of the model to unregister.")
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def unregister_model(
     endpoint: Optional[str],
     model_type: str,
     model_name: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)

-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,
@@ -437,15 +455,24 @@ def unregister_model(
     type=str,
     help="Filter by model type (default is 'LLM').",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def list_model_registrations(
     endpoint: Optional[str],
     model_type: str,
+    api_key: Optional[str],
 ):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     registrations = client.list_model_registrations(model_type=model_type)

@@ -632,12 +659,31 @@ def list_model_registrations(
     type=(str, str),
     multiple=True,
 )
+@click.option(
+    "--worker-ip",
+    default=None,
+    type=str,
+    help="Specify which worker this model runs on by ip, for distributed situation.",
+)
+@click.option(
+    "--gpu-idx",
+    default=None,
+    type=str,
+    help="Specify which GPUs of a worker this model can run on, separated with commas.",
+)
 @click.option(
     "--trust-remote-code",
     default=True,
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 @click.pass_context
 def model_launch(
     ctx,
@@ -653,7 +699,10 @@ def model_launch(
     peft_model_path: Optional[str],
     image_lora_load_kwargs: Optional[Tuple],
     image_lora_fuse_kwargs: Optional[Tuple],
+    worker_ip: Optional[str],
+    gpu_idx: Optional[str],
     trust_remote_code: bool,
+    api_key: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):
@@ -680,14 +729,19 @@ def model_launch(
         else None
     )

+    _gpu_idx: Optional[List[int]] = (
+        None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
+    )
+
     endpoint = get_endpoint(endpoint)
     model_size: Optional[Union[str, int]] = (
         size_in_billions
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     model_uid = client.launch_model(
         model_name=model_name,
@@ -701,6 +755,8 @@ def model_launch(
         peft_model_path=peft_model_path,
         image_lora_load_kwargs=image_lora_load_params,
         image_lora_fuse_kwargs=image_lora_fuse_params,
+        worker_ip=worker_ip,
+        gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
         **kwargs,
     )
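On the command line the same two knobs surface as --worker-ip and --gpu-idx (alongside --api-key/-ak for authenticated deployments). --gpu-idx takes a comma-separated string such as 0,1, which the code above converts into the integer list handed to client.launch_model; per the worker.py diff, specifying gpu_idx causes the worker to ignore n_gpu. The launch subcommand registration itself is not shown in this diff, but an invocation would look roughly like `xinference launch ... --worker-ip <worker-host> --gpu-idx 0,1`, with the usual model options in place of `...`.
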
@@ -718,12 +774,20 @@ def model_launch(
     type=str,
     help="Xinference endpoint.",
 )
-def model_list(endpoint: Optional[str]):
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def model_list(endpoint: Optional[str], api_key: Optional[str]):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []
@@ -844,13 +908,22 @@ def model_list(endpoint: Optional[str]):
     required=True,
     help="The unique identifier (UID) of the model.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_terminate(
     endpoint: Optional[str],
     model_uid: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -873,15 +946,24 @@ def model_terminate(
     type=bool,
     help="Whether to stream the generated text. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_generate(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to
@@ -959,16 +1041,25 @@ def model_generate(
     type=bool,
     help="Whether to stream the chat messages. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_chat(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     chat_history: "List[ChatCompletionMessage]" = []
     if stream:
@@ -1048,10 +1139,18 @@ def model_chat(

 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
 @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
-def vllm_models(endpoint: Optional[str]):
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def vllm_models(endpoint: Optional[str], api_key: Optional[str]):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-    client._set_token(get_stored_token(endpoint, client))
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     vllm_models_dict = client.vllm_models()
     print("VLLM supported model families:")
     chat_models = vllm_models_dict["chat"]
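Every command in this file now follows the same authentication pattern: the RESTfulClient is built with the optional api_key, and the locally stored login token is used only when no key was supplied. A minimal sketch of the key-based path (base URL and key are placeholders):

    # Hedged sketch of the new --api-key / -ak path; endpoint and key are placeholders.
    from xinference.client import RESTfulClient

    client = RESTfulClient(base_url="http://127.0.0.1:9997", api_key="sk-placeholder")
    print(client.list_models())  # any authenticated call works the same way
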
xinference/model/embedding/core.py CHANGED
@@ -136,7 +136,7 @@ class EmbeddingModel:
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
         from sentence_transformers import SentenceTransformer

-        normalize_embeddings = kwargs.pop("normalize_embeddings", True)
+        kwargs.setdefault("normalize_embeddings", True)

         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check
@@ -272,7 +272,6 @@ class EmbeddingModel:
             self._model,
             sentences,
             convert_to_numpy=False,
-            normalize_embeddings=normalize_embeddings,
             **kwargs,
         )
         if isinstance(sentences, str):
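The embedding change swaps an explicit kwargs.pop plus keyword argument for kwargs.setdefault: normalize_embeddings still defaults to True, but it now reaches the underlying encode call through **kwargs, so a caller-supplied value is forwarded unchanged. A small self-contained illustration of that kwargs handling (the helper name is invented for the example):

    # _prepare_encode_kwargs is a hypothetical stand-in for the kwargs handling
    # inside EmbeddingModel.create_embedding after this change.
    def _prepare_encode_kwargs(**kwargs):
        kwargs.setdefault("normalize_embeddings", True)
        return kwargs  # forwarded as encode(..., **kwargs)

    print(_prepare_encode_kwargs())                            # {'normalize_embeddings': True}
    print(_prepare_encode_kwargs(normalize_embeddings=False))  # {'normalize_embeddings': False}
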
xinference/model/llm/__init__.py CHANGED
@@ -49,14 +49,15 @@ from .llm_family import (

 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
-    from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
+    from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .pytorch.omnilmm import OmniLMMModel
     from .pytorch.qwen_vl import QwenVLChatModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .pytorch.yi_vl import YiVLChatModel
@@ -75,11 +76,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            CtransformersModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(
@@ -94,7 +90,9 @@ def _install():
             FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
+            OmniLMMModel,
             YiVLChatModel,
+            DeepSeekVLChatModel,
             PytorchModel,
         ]
     )
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -30,7 +30,6 @@ from ....types import (
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
-from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL

 logger = logging.getLogger(__name__)

@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
             return False
         if "generate" not in llm_family.model_ability:
             return False
@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name:
             return False
         if "chat" not in llm_family.model_ability:
             return False