xinference 0.9.4__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/oauth2/auth_service.py +47 -18
- xinference/api/oauth2/types.py +1 -0
- xinference/api/restful_api.py +34 -7
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +20 -4
- xinference/conftest.py +13 -2
- xinference/core/supervisor.py +48 -1
- xinference/core/worker.py +139 -20
- xinference/deploy/cmdline.py +119 -20
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +4 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +877 -13
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +571 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -26
- xinference/model/llm/pytorch/deepseek_vl.py +232 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/omnilmm.py +153 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +53 -5
- xinference/model/llm/vllm/core.py +54 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/deepseek_vl/__init__.py +31 -0
- xinference/thirdparty/deepseek_vl/models/__init__.py +28 -0
- xinference/thirdparty/deepseek_vl/models/clip_encoder.py +242 -0
- xinference/thirdparty/deepseek_vl/models/image_processing_vlm.py +208 -0
- xinference/thirdparty/deepseek_vl/models/modeling_vlm.py +170 -0
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +390 -0
- xinference/thirdparty/deepseek_vl/models/projector.py +100 -0
- xinference/thirdparty/deepseek_vl/models/sam.py +593 -0
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +681 -0
- xinference/thirdparty/deepseek_vl/utils/__init__.py +18 -0
- xinference/thirdparty/deepseek_vl/utils/conversation.py +348 -0
- xinference/thirdparty/deepseek_vl/utils/io.py +78 -0
- xinference/thirdparty/omnilmm/__init__.py +0 -0
- xinference/thirdparty/omnilmm/chat.py +216 -0
- xinference/thirdparty/omnilmm/constants.py +4 -0
- xinference/thirdparty/omnilmm/conversation.py +332 -0
- xinference/thirdparty/omnilmm/model/__init__.py +1 -0
- xinference/thirdparty/omnilmm/model/omnilmm.py +594 -0
- xinference/thirdparty/omnilmm/model/resampler.py +166 -0
- xinference/thirdparty/omnilmm/model/utils.py +563 -0
- xinference/thirdparty/omnilmm/train/__init__.py +13 -0
- xinference/thirdparty/omnilmm/train/train_utils.py +150 -0
- xinference/thirdparty/omnilmm/utils.py +134 -0
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/METADATA +25 -12
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/RECORD +79 -58
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.66b1c4fb.js +0 -3
- xinference/web/ui/build/static/js/main.66b1c4fb.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c2124cfe036b26befcbd386d1d17743b1a58d0b7a041a17bb67f9924400d63c3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fd4a8ae5d192331af1bedd1d2d70efcc569708ee6cc4cb479b225d059482aa81.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.66b1c4fb.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.4.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/core/worker.py
CHANGED

@@ -74,6 +74,10 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_model_spec: Dict[str, ModelDescription] = {}
         self._gpu_to_model_uid: Dict[int, str] = {}
         self._gpu_to_embedding_model_uids: Dict[int, Set[str]] = defaultdict(set)
+        # Dict structure: gpu_index: {(replica_model_uid, model_type)}
+        self._user_specified_gpu_to_model_uids: Dict[
+            int, Set[Tuple[str, str]]
+        ] = defaultdict(set)
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, int] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}

@@ -268,12 +272,27 @@
         """
         candidates = []
         for _dev in self._total_gpu_devices:
-            if
+            if (
+                _dev not in self._gpu_to_model_uid
+                and _dev not in self._user_specified_gpu_to_model_uids
+            ):  # no possible vllm model on it, add it to candidates
                 candidates.append(_dev)
-            else:
-
-
-
+            else:  # need to judge that whether to have vllm model on this device
+                has_vllm_model = False
+                if _dev in self._gpu_to_model_uid:
+                    existing_model_uid = self._gpu_to_model_uid[_dev]
+                    has_vllm_model = await self.is_model_vllm_backend(
+                        existing_model_uid
+                    )
+                if (
+                    not has_vllm_model
+                    and _dev in self._user_specified_gpu_to_model_uids
+                ):
+                    for rep_uid, _ in self._user_specified_gpu_to_model_uids[_dev]:
+                        has_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                        if has_vllm_model:
+                            break
+                if not has_vllm_model:
                     candidates.append(_dev)

         if len(candidates) == 0:

@@ -285,9 +304,13 @@
         device, min_cnt = -1, -1
         # Pick the device with the fewest existing models among all the candidate devices.
         for _dev in candidates:
-            existing_cnt =
+            existing_cnt = 0
+            if _dev in self._gpu_to_embedding_model_uids:
+                existing_cnt += len(self._gpu_to_embedding_model_uids[_dev])
             if _dev in self._gpu_to_model_uid:
                 existing_cnt += 1
+            if _dev in self._user_specified_gpu_to_model_uids:
+                existing_cnt += len(self._user_specified_gpu_to_model_uids[_dev])
             if min_cnt == -1 or existing_cnt < min_cnt:
                 device, min_cnt = _dev, existing_cnt

@@ -295,17 +318,82 @@
         return device

     def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
-
+        user_specified_allocated_devices: Set[int] = set()
+        for dev, model_infos in self._user_specified_gpu_to_model_uids.items():
+            allocated_non_embedding_rerank_models = False
+            for _, model_type in model_infos:
+                allocated_non_embedding_rerank_models = model_type not in [
+                    "embedding",
+                    "rerank",
+                ]
+                if allocated_non_embedding_rerank_models:
+                    break
+            if allocated_non_embedding_rerank_models:
+                user_specified_allocated_devices.add(dev)
+        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
+            user_specified_allocated_devices
+        )
+        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
             raise RuntimeError("No available slot found for the model")

         devices: List[int] = [
-            dev
+            dev
+            for dev in self._total_gpu_devices
+            if dev not in self._gpu_to_model_uid
+            and dev not in user_specified_allocated_devices
         ][:n_gpu]
         for dev in devices:
             self._gpu_to_model_uid[int(dev)] = model_uid

         return sorted(devices)

+    async def allocate_devices_with_gpu_idx(
+        self, model_uid: str, model_type: str, gpu_idx: List[int]
+    ) -> List[int]:
+        """
+        When user specifies the gpu_idx, allocate models on user-specified GPUs whenever possible
+        """
+        # must be subset of total devices visible to this worker
+        if not set(gpu_idx) <= set(self._total_gpu_devices):
+            raise ValueError(
+                f"Worker {self.address} cannot use the GPUs with these indexes: {gpu_idx}. "
+                f"Worker {self.address} can only see these GPUs: {self._total_gpu_devices}."
+            )
+        # currently just report a warning log when there are already models on these GPUs
+        for idx in gpu_idx:
+            existing_model_uids = []
+            if idx in self._gpu_to_model_uid:
+                rep_uid = self._gpu_to_model_uid[idx]
+                is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                if is_vllm_model:
+                    raise RuntimeError(
+                        f"GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                        f"therefore cannot allocate GPU memory for a new model."
+                    )
+                existing_model_uids.append(rep_uid)
+            if idx in self._gpu_to_embedding_model_uids:
+                existing_model_uids.extend(self._gpu_to_embedding_model_uids[idx])
+            # If user has run the vLLM model on the GPU that was forced to be specified,
+            # it is not possible to force this GPU to be allocated again
+            if idx in self._user_specified_gpu_to_model_uids:
+                for rep_uid, _ in self._user_specified_gpu_to_model_uids[idx]:
+                    is_vllm_model = await self.is_model_vllm_backend(rep_uid)
+                    if is_vllm_model:
+                        raise RuntimeError(
+                            f"User specified GPU index {idx} has been occupied with a vLLM model: {rep_uid}, "
+                            f"therefore cannot allocate GPU memory for a new model."
+                        )
+
+            if existing_model_uids:
+                logger.warning(
+                    f"WARNING!!! GPU index {idx} has been occupied "
+                    f"with these models on it: {existing_model_uids}"
+                )
+
+        for idx in gpu_idx:
+            self._user_specified_gpu_to_model_uids[idx].add((model_uid, model_type))
+        return sorted(gpu_idx)
+
     def release_devices(self, model_uid: str):
         devices = [
             dev

@@ -320,27 +408,46 @@
             if model_uid in self._gpu_to_embedding_model_uids[dev]:
                 self._gpu_to_embedding_model_uids[dev].remove(model_uid)

+        # check user-specified slots
+        for dev in self._user_specified_gpu_to_model_uids:
+            model_infos = list(
+                filter(
+                    lambda x: x[0] == model_uid,
+                    self._user_specified_gpu_to_model_uids[dev],
+                )
+            )
+            for model_info in model_infos:
+                self._user_specified_gpu_to_model_uids[dev].remove(model_info)
+
     async def _create_subpool(
         self,
         model_uid: str,
         model_type: Optional[str] = None,
         n_gpu: Optional[Union[int, str]] = "auto",
+        gpu_idx: Optional[List[int]] = None,
     ) -> Tuple[str, List[str]]:
         env = {}
         devices = []
-        if
-
-
-
-
-
+        if gpu_idx is None:
+            if isinstance(n_gpu, int) or (n_gpu == "auto" and gpu_count() > 0):
+                # Currently, n_gpu=auto means using 1 GPU
+                gpu_cnt = n_gpu if isinstance(n_gpu, int) else 1
+                devices = (
+                    [await self.allocate_devices_for_embedding(model_uid)]
+                    if model_type in ["embedding", "rerank"]
+                    else self.allocate_devices(model_uid=model_uid, n_gpu=gpu_cnt)
+                )
+                env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
+                logger.debug(f"GPU selected: {devices} for model {model_uid}")
+            if n_gpu is None:
+                env["CUDA_VISIBLE_DEVICES"] = "-1"
+                logger.debug(f"GPU disabled for model {model_uid}")
+        else:
+            assert isinstance(gpu_idx, list)
+            devices = await self.allocate_devices_with_gpu_idx(
+                model_uid, model_type, gpu_idx  # type: ignore
             )
             env["CUDA_VISIBLE_DEVICES"] = ",".join([str(dev) for dev in devices])
-            logger.debug(f"GPU selected: {devices} for model {model_uid}")
-        if n_gpu is None:
-            env["CUDA_VISIBLE_DEVICES"] = "-1"
-            logger.debug(f"GPU disabled for model {model_uid}")

         if os.name != "nt" and platform.system() != "Darwin":
             # Linux

@@ -495,6 +602,7 @@
         image_lora_load_kwargs: Optional[Dict] = None,
         image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
+        gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)

@@ -510,6 +618,17 @@
         launch_args.pop("self")
         launch_args.pop("kwargs")
         launch_args.update(kwargs)
+
+        if gpu_idx is not None:
+            logger.info(
+                f"You specify to launch the model: {model_name} on GPU index: {gpu_idx} "
+                f"of the worker: {self.address}, "
+                f"xinference will automatically ignore the `n_gpu` option."
+            )
+            if isinstance(gpu_idx, int):
+                gpu_idx = [gpu_idx]
+            assert isinstance(gpu_idx, list)
+
         if n_gpu is not None:
             if isinstance(n_gpu, int) and (n_gpu <= 0 or n_gpu > gpu_count()):
                 raise ValueError(

@@ -535,7 +654,7 @@
         is_local_deployment = await self._supervisor_ref.is_local_deployment()

         subpool_address, devices = await self._create_subpool(
-            model_uid, model_type, n_gpu=n_gpu
+            model_uid, model_type, n_gpu=n_gpu, gpu_idx=gpu_idx
         )

         try:
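The worker hunks above introduce `_user_specified_gpu_to_model_uids` and `allocate_devices_with_gpu_idx`: a model pinned to explicit GPU indexes is tracked per `(replica_model_uid, model_type)`, and a GPU already claimed by a vLLM model is refused, since vLLM pre-allocates most of a device's memory. Below is a minimal, self-contained sketch of that bookkeeping; the class and attribute names are illustrative stand-ins, not xinference's actual `WorkerActor`.

```python
from collections import defaultdict
from typing import Dict, List, Set, Tuple


class GpuAllocatorSketch:
    """Illustrative stand-in for the per-GPU bookkeeping added to the worker."""

    def __init__(self, total_gpu_devices: List[int], vllm_model_uids: Set[str]):
        self.total_gpu_devices = total_gpu_devices
        # stand-in for the async is_model_vllm_backend() lookup in the real worker
        self.vllm_model_uids = vllm_model_uids
        # gpu_index -> {(replica_model_uid, model_type)}, mirroring the new dict above
        self.user_specified: Dict[int, Set[Tuple[str, str]]] = defaultdict(set)

    def allocate_with_gpu_idx(self, model_uid: str, model_type: str, gpu_idx: List[int]) -> List[int]:
        # user-specified indexes must be visible to this worker
        if not set(gpu_idx) <= set(self.total_gpu_devices):
            raise ValueError(f"GPUs {gpu_idx} not all visible; visible: {self.total_gpu_devices}")
        for idx in gpu_idx:
            # a GPU already running a vLLM model cannot be shared
            if any(uid in self.vllm_model_uids for uid, _ in self.user_specified[idx]):
                raise RuntimeError(f"GPU index {idx} is occupied by a vLLM model")
        for idx in gpu_idx:
            self.user_specified[idx].add((model_uid, model_type))
        return sorted(gpu_idx)


allocator = GpuAllocatorSketch(total_gpu_devices=[0, 1], vllm_model_uids={"qwen-vllm-0"})
print(allocator.allocate_with_gpu_idx("my-llm-1-0", "LLM", [1]))  # -> [1]
```

The real implementation additionally only warns, rather than failing, when non-vLLM models already occupy a user-specified GPU, as the `logger.warning` call in the hunk shows.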
xinference/deploy/cmdline.py
CHANGED

@@ -376,18 +376,27 @@ def worker(
     is_flag=True,
     help="Persist the model configuration to the filesystem, retains the model registration after server restarts.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def register_model(
     endpoint: Optional[str],
     model_type: str,
     file: str,
     persist: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
     with open(file) as fd:
         model = fd.read()

-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.register_model(
         model_type=model_type,
         model=model,

@@ -408,15 +417,24 @@ def register_model(
     help="Type of model to unregister (default is 'LLM').",
 )
 @click.option("--model-name", "-n", type=str, help="Name of the model to unregister.")
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def unregister_model(
     endpoint: Optional[str],
     model_type: str,
     model_name: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)

-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.unregister_model(
         model_type=model_type,
         model_name=model_name,

@@ -437,15 +455,24 @@ def unregister_model(
     type=str,
     help="Filter by model type (default is 'LLM').",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def list_model_registrations(
     endpoint: Optional[str],
     model_type: str,
+    api_key: Optional[str],
 ):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     registrations = client.list_model_registrations(model_type=model_type)

@@ -632,12 +659,31 @@ def list_model_registrations(
     type=(str, str),
     multiple=True,
 )
+@click.option(
+    "--worker-ip",
+    default=None,
+    type=str,
+    help="Specify which worker this model runs on by ip, for distributed situation.",
+)
+@click.option(
+    "--gpu-idx",
+    default=None,
+    type=str,
+    help="Specify which GPUs of a worker this model can run on, separated with commas.",
+)
 @click.option(
     "--trust-remote-code",
     default=True,
     type=bool,
     help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 @click.pass_context
 def model_launch(
     ctx,

@@ -653,7 +699,10 @@ def model_launch(
     peft_model_path: Optional[str],
     image_lora_load_kwargs: Optional[Tuple],
     image_lora_fuse_kwargs: Optional[Tuple],
+    worker_ip: Optional[str],
+    gpu_idx: Optional[str],
     trust_remote_code: bool,
+    api_key: Optional[str],
 ):
     kwargs = {}
     for i in range(0, len(ctx.args), 2):

@@ -680,14 +729,19 @@
         else None
     )

+    _gpu_idx: Optional[List[int]] = (
+        None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
+    )
+
     endpoint = get_endpoint(endpoint)
     model_size: Optional[Union[str, int]] = (
         size_in_billions
         if size_in_billions is None or "_" in size_in_billions
         else int(size_in_billions)
     )
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     model_uid = client.launch_model(
         model_name=model_name,

@@ -701,6 +755,8 @@
         peft_model_path=peft_model_path,
         image_lora_load_kwargs=image_lora_load_params,
         image_lora_fuse_kwargs=image_lora_fuse_params,
+        worker_ip=worker_ip,
+        gpu_idx=_gpu_idx,
         trust_remote_code=trust_remote_code,
         **kwargs,
     )

@@ -718,12 +774,20 @@
     type=str,
     help="Xinference endpoint.",
 )
-
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def model_list(endpoint: Optional[str], api_key: Optional[str]):
     from tabulate import tabulate

     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     llm_table = []
     embedding_table = []

@@ -844,13 +908,22 @@ def model_list(endpoint: Optional[str]):
     required=True,
     help="The unique identifier (UID) of the model.",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_terminate(
     endpoint: Optional[str],
     model_uid: str,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     client.terminate_model(model_uid=model_uid)


@@ -873,15 +946,24 @@ def model_terminate(
     type=bool,
     help="Whether to stream the generated text. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_generate(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     if stream:
         # TODO: when stream=True, RestfulClient cannot generate words one by one.
         # So use Client in temporary. The implementation needs to be changed to

@@ -959,16 +1041,25 @@ def model_generate(
     type=bool,
     help="Whether to stream the chat messages. Use 'True' for streaming (default is True).",
 )
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
 def model_chat(
     endpoint: Optional[str],
     model_uid: str,
     max_tokens: int,
     stream: bool,
+    api_key: Optional[str],
 ):
     # TODO: chat model roles may not be user and assistant.
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))

     chat_history: "List[ChatCompletionMessage]" = []
     if stream:

@@ -1048,10 +1139,18 @@ def model_chat(

 @cli.command("vllm-models", help="Query and display models compatible with vLLM.")
 @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
-
+@click.option(
+    "--api-key",
+    "-ak",
+    default=None,
+    type=str,
+    help="Api-Key for access xinference api with authorization.",
+)
+def vllm_models(endpoint: Optional[str], api_key: Optional[str]):
     endpoint = get_endpoint(endpoint)
-    client = RESTfulClient(base_url=endpoint)
-
+    client = RESTfulClient(base_url=endpoint, api_key=api_key)
+    if api_key is None:
+        client._set_token(get_stored_token(endpoint, client))
     vllm_models_dict = client.vllm_models()
     print("VLLM supported model families:")
     chat_models = vllm_models_dict["chat"]
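Every CLI command above gains the same `--api-key`/`-ak` option and the same client-construction pattern: pass the key to `RESTfulClient`, and fall back to a locally stored token when no key is given. Below is a runnable sketch of that pattern with the xinference helpers stubbed out; `RESTfulClient`, `get_endpoint`, and `get_stored_token` are the names used in the hunks above, but the stub bodies here are assumptions for illustration only.

```python
from typing import Optional


class RESTfulClient:  # stand-in for xinference.client.restful.restful_client.RESTfulClient
    def __init__(self, base_url: str, api_key: Optional[str] = None):
        self.base_url, self.api_key = base_url, api_key

    def _set_token(self, token: Optional[str]) -> None:
        self.api_key = token


def get_endpoint(endpoint: Optional[str]) -> str:
    return endpoint or "http://127.0.0.1:9997"  # assumed default endpoint


def get_stored_token(endpoint: str, client: RESTfulClient) -> Optional[str]:
    return None  # stand-in: the real helper looks up a previously cached token


def build_client(endpoint: Optional[str], api_key: Optional[str]) -> RESTfulClient:
    endpoint = get_endpoint(endpoint)
    client = RESTfulClient(base_url=endpoint, api_key=api_key)
    if api_key is None:  # no --api-key given: try any stored token instead
        client._set_token(get_stored_token(endpoint, client))
    return client


print(build_client(None, api_key="sk-example").base_url)
```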

xinference/model/embedding/core.py
CHANGED

@@ -136,7 +136,7 @@ class EmbeddingModel:
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
         from sentence_transformers import SentenceTransformer

-
+        kwargs.setdefault("normalize_embeddings", True)

         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check

@@ -272,7 +272,6 @@
             self._model,
             sentences,
             convert_to_numpy=False,
-            normalize_embeddings=normalize_embeddings,
             **kwargs,
         )
         if isinstance(sentences, str):
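The embedding change replaces an explicit `normalize_embeddings=` argument with a `kwargs.setdefault(...)` at the top of `create_embedding`, so normalization is on by default while an explicit caller value still wins. A tiny illustration of the `setdefault` behavior (the function here is a toy, not xinference's `EmbeddingModel`):

```python
def create_embedding_sketch(sentences, **kwargs):
    # default normalization on, unless the caller passed normalize_embeddings explicitly
    kwargs.setdefault("normalize_embeddings", True)
    return kwargs["normalize_embeddings"]


print(create_embedding_sketch("hello"))                              # True
print(create_embedding_sketch("hello", normalize_embeddings=False))  # False
```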
xinference/model/llm/__init__.py
CHANGED

@@ -49,14 +49,15 @@ from .llm_family import (

 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
-    from .ggml.ctransformers import CtransformersModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.core import PytorchChatModel, PytorchModel
+    from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
     from .pytorch.internlm2 import Internlm2PytorchChatModel
     from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
+    from .pytorch.omnilmm import OmniLMMModel
     from .pytorch.qwen_vl import QwenVLChatModel
     from .pytorch.vicuna import VicunaPytorchChatModel
     from .pytorch.yi_vl import YiVLChatModel

@@ -75,11 +76,6 @@ def _install():
             ChatglmCppChatModel,
         ]
     )
-    LLM_CLASSES.extend(
-        [
-            CtransformersModel,
-        ]
-    )
     LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
     LLM_CLASSES.extend(

@@ -94,7 +90,9 @@ def _install():
             FalconPytorchModel,
             Internlm2PytorchChatModel,
             QwenVLChatModel,
+            OmniLMMModel,
             YiVLChatModel,
+            DeepSeekVLChatModel,
             PytorchModel,
         ]
     )
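`_install()` keeps `LLM_CLASSES` as a flat registry: dropping `CtransformersModel` and appending `OmniLMMModel` and `DeepSeekVLChatModel` changes which backends are even considered for a model. A simplified sketch of registry-style selection, assuming each registered class exposes a `match(llm_family, llm_spec, quantization)` classmethod like the ones visible in the llamacpp.py hunks below; this is illustrative, not xinference's actual dispatch code.

```python
from typing import List, Type

LLM_CLASSES: List[Type] = []


class DummyGgufBackend:
    @classmethod
    def match(cls, llm_family: dict, llm_spec: dict, quantization: str) -> bool:
        # hypothetical backend that only handles ggml/gguf weights
        return llm_spec["model_format"] in ("ggmlv3", "ggufv2")


LLM_CLASSES.append(DummyGgufBackend)


def select_backend(llm_family: dict, llm_spec: dict, quantization: str) -> Type:
    for klass in LLM_CLASSES:  # first registered class that matches wins
        if klass.match(llm_family, llm_spec, quantization):
            return klass
    raise ValueError("no registered backend matches this model spec")


print(select_backend({}, {"model_format": "ggufv2"}, "Q4_K_M").__name__)
```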

xinference/model/llm/ggml/llamacpp.py
CHANGED

@@ -30,7 +30,6 @@ from ....types import (
 from ..core import LLM
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import ChatModelMixin
-from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL

 logger = logging.getLogger(__name__)

@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or "qwen" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
             return False
         if "generate" not in llm_family.model_ability:
             return False

@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
     ) -> bool:
         if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
             return False
-        if (
-            "chatglm" in llm_family.model_name
-            or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
-        ):
+        if "chatglm" in llm_family.model_name:
             return False
         if "chat" not in llm_family.model_ability:
             return False
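With the ctransformers backend gone, the llama.cpp chat backend's gating reduces to the checks visible above: a ggml/gguf format, not a chatglm family, and the `chat` ability. A runnable sketch of that gating with the family and spec objects stubbed as dataclasses; the stubs are assumptions, while the conditions mirror the hunk above.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class FamilyStub:  # stand-in for LLMFamilyV1
    model_name: str
    model_ability: List[str] = field(default_factory=lambda: ["generate", "chat"])


@dataclass
class SpecStub:  # stand-in for LLMSpecV1
    model_format: str


def chat_match(llm_family: FamilyStub, llm_spec: SpecStub) -> bool:
    if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
        return False
    if "chatglm" in llm_family.model_name:  # chatglm ggml has its own backend
        return False
    if "chat" not in llm_family.model_ability:
        return False
    return True


print(chat_match(FamilyStub("llama-2-chat"), SpecStub("ggufv2")))  # True
print(chat_match(FamilyStub("chatglm3"), SpecStub("ggufv2")))      # False
```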