xinference 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +31 -0
- xinference/client/common.py +2 -0
- xinference/client/restful/restful_client.py +25 -0
- xinference/core/supervisor.py +11 -1
- xinference/core/worker.py +16 -0
- xinference/deploy/cmdline.py +53 -1
- xinference/device_utils.py +0 -2
- xinference/model/core.py +13 -2
- xinference/model/image/core.py +16 -2
- xinference/model/image/stable_diffusion/core.py +25 -2
- xinference/model/llm/__init__.py +17 -0
- xinference/model/llm/core.py +18 -2
- xinference/model/llm/ggml/llamacpp.py +3 -19
- xinference/model/llm/llm_family.json +8 -3
- xinference/model/llm/llm_family.py +100 -29
- xinference/model/llm/llm_family_modelscope.json +7 -2
- xinference/model/llm/pytorch/baichuan.py +2 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +23 -0
- xinference/model/llm/pytorch/falcon.py +4 -0
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/llama_2.py +4 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -0
- xinference/model/llm/pytorch/vicuna.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.ebf7716d.js → main.78829790.js} +3 -3
- xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/METADATA +3 -1
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/RECORD +39 -39
- xinference/web/ui/build/static/js/main.ebf7716d.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
- /xinference/web/ui/build/static/js/{main.ebf7716d.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-03-
+ "date": "2024-03-08T13:28:03+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.9.1"
+ "full-revisionid": "29f4c10a854cfec684dcf8398a0974f64bf8ce2b",
+ "version": "0.9.2"
 }
 '''  # END VERSION_JSON

xinference/api/restful_api.py
CHANGED
@@ -219,6 +219,11 @@ class RESTfulAPI:
         self._router.add_api_route(
             "/v1/models/families", self._get_builtin_families, methods=["GET"]
         )
+        self._router.add_api_route(
+            "/v1/models/vllm-supported",
+            self.list_vllm_supported_model_families,
+            methods=["GET"],
+        )
         self._router.add_api_route(
             "/v1/cluster/info", self.get_cluster_device_info, methods=["GET"]
         )
@@ -651,6 +656,9 @@ class RESTfulAPI:
         replica = payload.get("replica", 1)
         n_gpu = payload.get("n_gpu", "auto")
         request_limits = payload.get("request_limits", None)
+        peft_model_path = payload.get("peft_model_path", None)
+        image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
+        image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)

         exclude_keys = {
             "model_uid",
@@ -662,6 +670,9 @@ class RESTfulAPI:
             "replica",
             "n_gpu",
             "request_limits",
+            "peft_model_path",
+            "image_lora_load_kwargs",
+            "image_lora_fuse_kwargs",
         }

         kwargs = {
@@ -686,6 +697,9 @@ class RESTfulAPI:
                 n_gpu=n_gpu,
                 request_limits=request_limits,
                 wait_ready=wait_ready,
+                peft_model_path=peft_model_path,
+                image_lora_load_kwargs=image_lora_load_kwargs,
+                image_lora_fuse_kwargs=image_lora_fuse_kwargs,
                 **kwargs,
             )

@@ -1258,6 +1272,7 @@
                     self.handle_request_limit_error(re)
                 async for item in iterator:
                     yield item
+                yield "[DONE]"
            except Exception as ex:
                logger.exception("Chat completion stream got an error: %s", ex)
                await self._report_error_event(model_uid, str(ex))
@@ -1350,6 +1365,22 @@
             logger.error(e, exc_info=True)
             raise HTTPException(status_code=500, detail=str(e))

+    async def list_vllm_supported_model_families(self) -> JSONResponse:
+        try:
+            from ..model.llm.vllm.core import (
+                VLLM_SUPPORTED_CHAT_MODELS,
+                VLLM_SUPPORTED_MODELS,
+            )
+
+            data = {
+                "chat": VLLM_SUPPORTED_CHAT_MODELS,
+                "generate": VLLM_SUPPORTED_MODELS,
+            }
+            return JSONResponse(content=data)
+        except Exception as e:
+            logger.error(e, exc_info=True)
+            raise HTTPException(status_code=500, detail=str(e))
+
     async def get_cluster_device_info(
         self, detailed: bool = Query(False)
     ) -> JSONResponse:
xinference/client/common.py
CHANGED
@@ -43,6 +43,8 @@ def streaming_response_iterator(
         line = line.strip()
         if line.startswith(b"data:"):
             json_str = line[len(b"data:") :].strip()
+            if json_str == b"[DONE]":
+                continue
             data = json.loads(json_str.decode("utf-8"))
             error = data.get("error")
             if error is not None:
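This guard mirrors the server-side change above: the stream now ends with a literal `[DONE]` data frame, which has to be skipped rather than JSON-decoded. A small self-contained sketch of the frames the iterator sees, assuming standard SSE framing and hypothetical payloads:

    import json

    frames = [
        b'data: {"choices": [{"delta": {"content": "Hel"}}]}',  # hypothetical chunk
        b'data: {"choices": [{"delta": {"content": "lo"}}]}',   # hypothetical chunk
        b"data: [DONE]",                                        # new end-of-stream sentinel
    ]

    for line in frames:
        json_str = line[len(b"data:"):].strip()
        if json_str == b"[DONE]":   # same check the hunk adds
            continue                # json.loads would otherwise raise here
        print(json.loads(json_str.decode("utf-8")))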
xinference/client/restful/restful_client.py
CHANGED

@@ -683,6 +683,19 @@ class Client:
         response_data = response.json()
         self._cluster_authed = bool(response_data["auth"])

+    def vllm_models(self) -> Dict[str, Any]:
+        url = f"{self.base_url}/v1/models/vllm-supported"
+        response = requests.get(url, headers=self._headers)
+        if response.status_code != 200:
+            raise RuntimeError(
+                f"Failed to fetch VLLM models. detail: {response.json()['detail']}"
+            )
+
+        try:
+            return response.json()
+        except Exception as e:
+            raise RuntimeError(f"Error parsing JSON response: {e}")
+
     def login(self, username: str, password: str):
         if not self._cluster_authed:
             return
@@ -778,6 +791,9 @@ class Client:
         replica: int = 1,
         n_gpu: Optional[Union[int, str]] = "auto",
         request_limits: Optional[int] = None,
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> str:
         """
@@ -805,6 +821,12 @@ class Client:
         request_limits: Optional[int]
             The number of request limits for this model, default is None.
             ``request_limits=None`` means no limits for this model.
+        peft_model_path: Optional[str]
+            PEFT (Parameter-Efficient Fine-Tuning) model path.
+        image_lora_load_kwargs: Optional[Dict]
+            lora load parameters for image model
+        image_lora_fuse_kwargs: Optional[Dict]
+            lora fuse parameters for image model
         **kwargs:
             Any other parameters been specified.

@@ -827,6 +849,9 @@ class Client:
             "replica": replica,
             "n_gpu": n_gpu,
             "request_limits": request_limits,
+            "peft_model_path": peft_model_path,
+            "image_lora_load_kwargs": image_lora_load_kwargs,
+            "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
         }

         for key, value in kwargs.items():
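On the client side, `vllm_models()` wraps the new endpoint and `launch_model()` simply forwards the three new arguments in its payload. A brief usage sketch, assuming a running endpoint; the URL is an assumption:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")  # endpoint is an assumption
    supported = client.vllm_models()                 # GET /v1/models/vllm-supported
    print(supported["chat"])
    print(supported["generate"])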
xinference/core/supervisor.py
CHANGED
@@ -714,6 +714,9 @@ class SupervisorActor(xo.StatelessActor):
         request_limits: Optional[int] = None,
         wait_ready: bool = True,
         model_version: Optional[str] = None,
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> str:
         if model_uid is None:
@@ -751,6 +754,9 @@ class SupervisorActor(xo.StatelessActor):
                 model_type=model_type,
                 n_gpu=n_gpu,
                 request_limits=request_limits,
+                peft_model_path=peft_model_path,
+                image_lora_load_kwargs=image_lora_load_kwargs,
+                image_lora_fuse_kwargs=image_lora_fuse_kwargs,
                 **kwargs,
             )
             self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
@@ -922,7 +928,11 @@ class SupervisorActor(xo.StatelessActor):
         workers = list(self._worker_address_to_worker.values())
         for worker in workers:
             ret.update(await worker.list_models())
-
+        running_model_info = {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+        # add replica count
+        for k, v in running_model_info.items():
+            v["replica"] = self._model_uid_to_replica_info[k].replica
+        return running_model_info

     def is_local_deployment(self) -> bool:
         # TODO: temporary.
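After this change `list_models` keys its result by the bare model uid (the replica suffix stripped via `parse_replica_model_uid`) and annotates every entry with its replica count. A hedged sketch of the resulting shape; all field names other than `replica` are placeholders:

    # Hypothetical SupervisorActor.list_models() result after this change:
    {
        "my-llama-2": {                    # replica suffix already stripped
            "model_name": "llama-2-chat",  # placeholder
            "model_type": "LLM",           # placeholder
            "replica": 2,                  # newly added from _model_uid_to_replica_info
        }
    }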
xinference/core/worker.py
CHANGED
@@ -491,6 +491,9 @@ class WorkerActor(xo.StatelessActor):
         quantization: Optional[str],
         model_type: str = "LLM",
         n_gpu: Optional[Union[int, str]] = "auto",
+        peft_model_path: Optional[str] = None,
+        image_lora_load_kwargs: Optional[Dict] = None,
+        image_lora_fuse_kwargs: Optional[Dict] = None,
         request_limits: Optional[int] = None,
         **kwargs,
     ):
@@ -516,6 +519,16 @@ class WorkerActor(xo.StatelessActor):
         if isinstance(n_gpu, str) and n_gpu != "auto":
             raise ValueError("Currently `n_gpu` only supports `auto`.")

+        if peft_model_path is not None:
+            if model_type in ("embedding", "rerank"):
+                raise ValueError(
+                    f"PEFT adaptors cannot be applied to embedding or rerank models."
+                )
+            if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+                raise ValueError(
+                    f"PEFT adaptors can only be applied to pytorch-like models"
+                )
+
         assert model_uid not in self._model_uid_to_model
         self._check_model_is_valid(model_name, model_format)
         assert self._supervisor_ref is not None
@@ -537,6 +550,9 @@ class WorkerActor(xo.StatelessActor):
             model_format,
             model_size_in_billions,
             quantization,
+            peft_model_path,
+            image_lora_load_kwargs,
+            image_lora_fuse_kwargs,
             is_local_deployment,
             **kwargs,
         )
xinference/deploy/cmdline.py
CHANGED
@@ -17,7 +17,7 @@ import logging
 import os
 import sys
 import warnings
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union

 import click
 from xoscar.utils import get_next_port
@@ -596,6 +596,26 @@ def list_model_registrations(
     type=str,
     help='The number of GPUs used by the model, default is "auto".',
 )
+@click.option(
+    "--peft-model-path",
+    default=None,
+    type=str,
+    help="PEFT model path.",
+)
+@click.option(
+    "--image-lora-load-kwargs",
+    "-ld",
+    "image_lora_load_kwargs",
+    type=(str, str),
+    multiple=True,
+)
+@click.option(
+    "--image-lora-fuse-kwargs",
+    "-fd",
+    "image_lora_fuse_kwargs",
+    type=(str, str),
+    multiple=True,
+)
 @click.option(
     "--trust-remote-code",
     default=True,
@@ -614,6 +634,9 @@ def model_launch(
     quantization: str,
     replica: int,
     n_gpu: str,
+    peft_model_path: Optional[str],
+    image_lora_load_kwargs: Optional[Tuple],
+    image_lora_fuse_kwargs: Optional[Tuple],
     trust_remote_code: bool,
 ):
     kwargs = {}
@@ -630,6 +653,17 @@ def model_launch(
     else:
         _n_gpu = int(n_gpu)

+    image_lora_load_params = (
+        {k: handle_click_args_type(v) for k, v in dict(image_lora_load_kwargs).items()}
+        if image_lora_load_kwargs
+        else None
+    )
+    image_lora_fuse_params = (
+        {k: handle_click_args_type(v) for k, v in dict(image_lora_fuse_kwargs).items()}
+        if image_lora_fuse_kwargs
+        else None
+    )
+
     endpoint = get_endpoint(endpoint)
     model_size: Optional[Union[str, int]] = (
         size_in_billions
@@ -648,6 +682,9 @@ def model_launch(
         quantization=quantization,
         replica=replica,
         n_gpu=_n_gpu,
+        peft_model_path=peft_model_path,
+        image_lora_load_kwargs=image_lora_load_params,
+        image_lora_fuse_kwargs=image_lora_fuse_params,
         trust_remote_code=trust_remote_code,
         **kwargs,
     )
@@ -944,6 +981,21 @@ def model_chat(
     )


+@cli.command("vllm-models", help="Query and display models compatible with VLLM.")
+@click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+def vllm_models(endpoint: Optional[str]):
+    endpoint = get_endpoint(endpoint)
+    client = RESTfulClient(base_url=endpoint)
+    client._set_token(get_stored_token(endpoint, client))
+    vllm_models_dict = client.vllm_models()
+    print("VLLM supported model families:")
+    chat_models = vllm_models_dict["chat"]
+    supported_models = vllm_models_dict["generate"]
+
+    print("VLLM supported chat model families:", chat_models)
+    print("VLLM supported generate model families:", supported_models)
+
+
 @cli.command("login", help="Login when the cluster is authenticated.")
 @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
 @click.option("--username", type=str, required=True, help="Username.")
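The two new LoRA options are repeatable click tuple options (`type=(str, str)`), so each occurrence contributes one key/value pair that `model_launch` folds into a dict with `handle_click_args_type`. Roughly, assuming the usual `xinference` entry point; paths, names, and values below are placeholders:

    # New subcommand: list model families the vLLM backend can serve.
    xinference vllm-models -e http://127.0.0.1:9997

    # Launch a pytorch-format model with a PEFT adapter (placeholder values).
    xinference launch -e http://127.0.0.1:9997 \
        --model-name llama-2-chat \
        --size-in-billions 7 \
        --model-format pytorch \
        --peft-model-path /path/to/adapter

    # For image models the LoRA kwargs are passed as repeated key/value pairs, e.g.
    #   -ld <load_key> <load_value>   ->  image_lora_load_kwargs
    #   -fd lora_scale 0.6            ->  image_lora_fuse_kwargs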
xinference/device_utils.py
CHANGED
xinference/model/core.py
CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.

 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from .._compat import BaseModel

@@ -52,6 +52,9 @@ def create_model_instance(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[int] = None,
     quantization: Optional[str] = None,
+    peft_model_path: Optional[str] = None,
+    image_lora_load_kwargs: Optional[Dict] = None,
+    image_lora_fuse_kwargs: Optional[Dict] = None,
     is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
@@ -70,6 +73,7 @@ def create_model_instance(
             model_format,
             model_size_in_billions,
             quantization,
+            peft_model_path,
             is_local_deployment,
             **kwargs,
         )
@@ -82,7 +86,14 @@ def create_model_instance(
     elif model_type == "image":
         kwargs.pop("trust_remote_code", None)
         return create_image_model_instance(
-            subpool_addr,
+            subpool_addr,
+            devices,
+            model_uid,
+            model_name,
+            lora_model_path=peft_model_path,
+            lora_load_kwargs=image_lora_load_kwargs,
+            lora_fuse_kwargs=image_lora_fuse_kwargs,
+            **kwargs,
         )
     elif model_type == "rerank":
         kwargs.pop("trust_remote_code", None)
xinference/model/image/core.py
CHANGED
@@ -155,7 +155,14 @@ def get_cache_status(


 def create_image_model_instance(
-    subpool_addr: str,
+    subpool_addr: str,
+    devices: List[str],
+    model_uid: str,
+    model_name: str,
+    lora_model_path: Optional[str] = None,
+    lora_load_kwargs: Optional[Dict] = None,
+    lora_fuse_kwargs: Optional[Dict] = None,
+    **kwargs,
 ) -> Tuple[DiffusionModel, ImageModelDescription]:
     model_spec = match_diffusion(model_name)
     controlnet = kwargs.get("controlnet")
@@ -187,7 +194,14 @@ def create_image_model_instance(
     else:
         kwargs["controlnet"] = controlnet_model_paths
     model_path = cache(model_spec)
-    model = DiffusionModel(
+    model = DiffusionModel(
+        model_uid,
+        model_path,
+        lora_model_path=lora_model_path,
+        lora_load_kwargs=lora_load_kwargs,
+        lora_fuse_kwargs=lora_fuse_kwargs,
+        **kwargs,
+    )
     model_description = ImageModelDescription(
         subpool_addr, devices, model_spec, model_path=model_path
     )
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -21,7 +21,7 @@ import uuid
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from io import BytesIO
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

 from ....constants import XINFERENCE_IMAGE_DIR
 from ....device_utils import move_model_to_available_device
@@ -32,14 +32,36 @@ logger = logging.getLogger(__name__)

 class DiffusionModel:
     def __init__(
-        self,
+        self,
+        model_uid: str,
+        model_path: str,
+        device: Optional[str] = None,
+        lora_model_path: Optional[str] = None,
+        lora_load_kwargs: Optional[Dict] = None,
+        lora_fuse_kwargs: Optional[Dict] = None,
+        **kwargs,
     ):
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._lora_model_path = lora_model_path
+        self._lora_load_kwargs = lora_load_kwargs or {}
+        self._lora_fuse_kwargs = lora_fuse_kwargs or {}
         self._kwargs = kwargs

+    def _apply_lora(self):
+        if self._lora_model_path is not None:
+            logger.info(
+                f"Loading the LoRA with load kwargs: {self._lora_load_kwargs}, fuse kwargs: {self._lora_fuse_kwargs}."
+            )
+            assert self._model is not None
+            self._model.load_lora_weights(
+                self._lora_model_path, **self._lora_load_kwargs
+            )
+            self._model.fuse_lora(**self._lora_fuse_kwargs)
+            logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
     def load(self):
         # import torch
         from diffusers import AutoPipelineForText2Image
@@ -61,6 +83,7 @@ class DiffusionModel:
             self._model = move_model_to_available_device(self._model)
         # Recommended if your computer has < 64 GB of RAM
         self._model.enable_attention_slicing()
+        self._apply_lora()

     def _call_model(
         self,
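`_apply_lora` leans on the diffusers pipeline methods `load_lora_weights` and `fuse_lora`. End to end, an image model with a LoRA can now be launched through the client; a hedged sketch, where the endpoint, model name, adapter path, and kwargs are placeholders rather than values taken from this diff:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")      # endpoint is an assumption
    model_uid = client.launch_model(
        model_name="stable-diffusion-xl-base-1.0",       # placeholder image model
        model_type="image",
        peft_model_path="/path/to/sdxl-lora",            # becomes lora_model_path above
        image_lora_load_kwargs={},                       # forwarded to load_lora_weights()
        image_lora_fuse_kwargs={"lora_scale": 0.6},      # forwarded to fuse_lora()
    )
    model = client.get_model(model_uid)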
xinference/model/llm/__init__.py
CHANGED
@@ -31,6 +31,7 @@ from .llm_family import (
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLM_CLASSES,
+    PEFT_SUPPORTED_CLASSES,
     CustomLLMFamilyV1,
     GgmlLLMSpecV1,
     LLMFamilyV1,
@@ -95,6 +96,22 @@ def _install():
             PytorchModel,
         ]
     )
+    PEFT_SUPPORTED_CLASSES.extend(
+        [
+            BaichuanPytorchChatModel,
+            VicunaPytorchChatModel,
+            FalconPytorchChatModel,
+            ChatglmPytorchChatModel,
+            LlamaPytorchModel,
+            LlamaPytorchChatModel,
+            PytorchChatModel,
+            FalconPytorchModel,
+            Internlm2PytorchChatModel,
+            QwenVLChatModel,
+            YiVLChatModel,
+            PytorchModel,
+        ]
+    )

     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
xinference/model/llm/core.py
CHANGED
@@ -180,6 +180,7 @@ def create_llm_model_instance(
     model_format: Optional[str] = None,
     model_size_in_billions: Optional[int] = None,
     quantization: Optional[str] = None,
+    peft_model_path: Optional[str] = None,
     is_local_deployment: bool = False,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
@@ -203,7 +204,9 @@ def create_llm_model_instance(
     assert quantization is not None
     save_path = cache(llm_family, llm_spec, quantization)

-    llm_cls = match_llm_cls(
+    llm_cls = match_llm_cls(
+        llm_family, llm_spec, quantization, peft_model_path=peft_model_path
+    )
     if not llm_cls:
         raise ValueError(
             f"Model not supported, name: {model_name}, format: {model_format},"
@@ -211,7 +214,20 @@ def create_llm_model_instance(
         )
     logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")

-
+    if peft_model_path is not None:
+        model = llm_cls(
+            model_uid,
+            llm_family,
+            llm_spec,
+            quantization,
+            save_path,
+            kwargs,
+            peft_model_path,
+        )
+    else:
+        model = llm_cls(
+            model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+        )
     return model, LLMDescription(
         subpool_addr, devices, llm_family, llm_spec, quantization
     )
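When `peft_model_path` is set, the adapter path is passed to the model class as an extra constructor argument, and `match_llm_cls` receives it as well, presumably so that only classes registered in `PEFT_SUPPORTED_CLASSES` (see the `__init__.py` hunk above) are eligible. A rough client-side launch example; the model name and adapter path are placeholders:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")   # endpoint is an assumption
    uid = client.launch_model(
        model_name="baichuan-2-chat",                 # placeholder pytorch-format model
        model_format="pytorch",                       # ggmlv3/ggufv2 + PEFT is rejected (worker.py hunk)
        peft_model_path="/path/to/peft-adapter",      # placeholder adapter path
    )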
xinference/model/llm/ggml/llamacpp.py
CHANGED

@@ -35,15 +35,6 @@ from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
 logger = logging.getLogger(__name__)


-SIZE_TO_GPU_LAYERS = {
-    3: 26,
-    7: 32,
-    13: 40,
-    30: 60,
-    65: 80,
-}
-
-
 class LlamaCppModel(LLM):
     def __init__(
         self,
@@ -56,13 +47,6 @@ class LlamaCppModel(LLM):
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)

-        closest_size = min(
-            SIZE_TO_GPU_LAYERS.keys(),
-            key=lambda x: abs(
-                x - self.handle_model_size(model_spec.model_size_in_billions)
-            ),
-        )
-        self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
         self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
             llamacpp_model_config
         )
@@ -96,9 +80,9 @@ class LlamaCppModel(LLM):

         if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
             # TODO: platform.processor() is not safe, need to be replaced to other method.
-            llamacpp_model_config.setdefault("n_gpu_layers", 1)
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)
         elif self._is_linux() and self._can_apply_cublas():
-            llamacpp_model_config.setdefault("n_gpu_layers",
+            llamacpp_model_config.setdefault("n_gpu_layers", -1)

         return llamacpp_model_config

@@ -313,7 +297,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
             generate_config["stop"] = [stop, "Observation:"]
         elif isinstance(stop, Iterable):
             assert not isinstance(stop, str)
-            generate_config["stop"] = stop + ["Observation:"]
+            generate_config["stop"] = stop + ["Observation:"]  # type: ignore
         else:
             generate_config["stop"] = "Observation:"

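The hard-coded `SIZE_TO_GPU_LAYERS` table, which guessed a layer count from the model size, is gone; `n_gpu_layers` now defaults to -1 on Apple Silicon and CUDA-capable Linux hosts. In llama-cpp-python, -1 means offloading all layers to the GPU, and the value can still be overridden per launch. A minimal sketch of the semantics at the llama-cpp-python level; the model path is a placeholder:

    from llama_cpp import Llama

    # n_gpu_layers=-1 offloads every layer llama.cpp can fit onto the GPU;
    # 0 keeps the model on the CPU, a positive value offloads that many layers.
    llm = Llama(model_path="/path/to/model.gguf", n_gpu_layers=-1)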
xinference/model/llm/llm_family.json
CHANGED

@@ -1599,10 +1599,15 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
@@ -2967,7 +2972,7 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 16384,
     "model_name": "glaive-coder",
     "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
     "model_lang": [