xinference 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (42)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +31 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +25 -0
  5. xinference/core/supervisor.py +11 -1
  6. xinference/core/worker.py +16 -0
  7. xinference/deploy/cmdline.py +53 -1
  8. xinference/device_utils.py +0 -2
  9. xinference/model/core.py +13 -2
  10. xinference/model/image/core.py +16 -2
  11. xinference/model/image/stable_diffusion/core.py +25 -2
  12. xinference/model/llm/__init__.py +17 -0
  13. xinference/model/llm/core.py +18 -2
  14. xinference/model/llm/ggml/llamacpp.py +3 -19
  15. xinference/model/llm/llm_family.json +8 -3
  16. xinference/model/llm/llm_family.py +100 -29
  17. xinference/model/llm/llm_family_modelscope.json +7 -2
  18. xinference/model/llm/pytorch/baichuan.py +2 -0
  19. xinference/model/llm/pytorch/chatglm.py +2 -0
  20. xinference/model/llm/pytorch/core.py +23 -0
  21. xinference/model/llm/pytorch/falcon.py +4 -0
  22. xinference/model/llm/pytorch/internlm2.py +2 -0
  23. xinference/model/llm/pytorch/llama_2.py +4 -0
  24. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  25. xinference/model/llm/pytorch/vicuna.py +2 -0
  26. xinference/model/llm/pytorch/yi_vl.py +1 -0
  27. xinference/web/ui/build/asset-manifest.json +3 -3
  28. xinference/web/ui/build/index.html +1 -1
  29. xinference/web/ui/build/static/js/{main.ebf7716d.js → main.78829790.js} +3 -3
  30. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  33. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/METADATA +3 -1
  34. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/RECORD +39 -39
  35. xinference/web/ui/build/static/js/main.ebf7716d.js.map +0 -1
  36. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  38. /xinference/web/ui/build/static/js/{main.ebf7716d.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  39. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  40. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  41. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  42. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
- "date": "2024-03-01T14:36:49+0800",
+ "date": "2024-03-08T13:28:03+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "7b20f76ff35c3ca1824656fcd792837d909b0351",
- "version": "0.9.1"
+ "full-revisionid": "29f4c10a854cfec684dcf8398a0974f64bf8ce2b",
+ "version": "0.9.2"
  }
  ''' # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -219,6 +219,11 @@ class RESTfulAPI:
  self._router.add_api_route(
  "/v1/models/families", self._get_builtin_families, methods=["GET"]
  )
+ self._router.add_api_route(
+ "/v1/models/vllm-supported",
+ self.list_vllm_supported_model_families,
+ methods=["GET"],
+ )
  self._router.add_api_route(
  "/v1/cluster/info", self.get_cluster_device_info, methods=["GET"]
  )
@@ -651,6 +656,9 @@ class RESTfulAPI:
  replica = payload.get("replica", 1)
  n_gpu = payload.get("n_gpu", "auto")
  request_limits = payload.get("request_limits", None)
+ peft_model_path = payload.get("peft_model_path", None)
+ image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
+ image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)

  exclude_keys = {
  "model_uid",
@@ -662,6 +670,9 @@ class RESTfulAPI:
  "replica",
  "n_gpu",
  "request_limits",
+ "peft_model_path",
+ "image_lora_load_kwargs",
+ "image_lora_fuse_kwargs",
  }

  kwargs = {
@@ -686,6 +697,9 @@ class RESTfulAPI:
  n_gpu=n_gpu,
  request_limits=request_limits,
  wait_ready=wait_ready,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_kwargs,
+ image_lora_fuse_kwargs=image_lora_fuse_kwargs,
  **kwargs,
  )

@@ -1258,6 +1272,7 @@ class RESTfulAPI:
  self.handle_request_limit_error(re)
  async for item in iterator:
  yield item
+ yield "[DONE]"
  except Exception as ex:
  logger.exception("Chat completion stream got an error: %s", ex)
  await self._report_error_event(model_uid, str(ex))
@@ -1350,6 +1365,22 @@ class RESTfulAPI:
  logger.error(e, exc_info=True)
  raise HTTPException(status_code=500, detail=str(e))

+ async def list_vllm_supported_model_families(self) -> JSONResponse:
+ try:
+ from ..model.llm.vllm.core import (
+ VLLM_SUPPORTED_CHAT_MODELS,
+ VLLM_SUPPORTED_MODELS,
+ )
+
+ data = {
+ "chat": VLLM_SUPPORTED_CHAT_MODELS,
+ "generate": VLLM_SUPPORTED_MODELS,
+ }
+ return JSONResponse(content=data)
+ except Exception as e:
+ logger.error(e, exc_info=True)
+ raise HTTPException(status_code=500, detail=str(e))
+
  async def get_cluster_device_info(
  self, detailed: bool = Query(False)
  ) -> JSONResponse:
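
The hunks above add a GET /v1/models/vllm-supported endpoint and append a "[DONE]" sentinel to chat-completion streams. A minimal sketch of querying the new endpoint; the URL path and the "chat"/"generate" keys come from the diff, while the host and port are assumptions:

import requests

base_url = "http://127.0.0.1:9997"  # assumed local xinference endpoint, not from the diff
resp = requests.get(f"{base_url}/v1/models/vllm-supported")
resp.raise_for_status()
families = resp.json()
print(families["chat"])      # model families vLLM can serve as chat models
print(families["generate"])  # model families vLLM can serve as generate models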
xinference/client/common.py CHANGED
@@ -43,6 +43,8 @@ def streaming_response_iterator(
  line = line.strip()
  if line.startswith(b"data:"):
  json_str = line[len(b"data:") :].strip()
+ if json_str == b"[DONE]":
+ continue
  data = json.loads(json_str.decode("utf-8"))
  error = data.get("error")
  if error is not None:
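
The client-side iterator now skips the "[DONE]" sentinel instead of trying to parse it as JSON. A self-contained sketch of that handling, with made-up example lines:

import json

lines = [b'data: {"choices": [{"text": "hi"}]}', b"data: [DONE]"]
for line in lines:
    line = line.strip()
    if line.startswith(b"data:"):
        json_str = line[len(b"data:"):].strip()
        if json_str == b"[DONE]":
            continue  # sentinel marks the end of the stream; nothing to decode
        print(json.loads(json_str.decode("utf-8")))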
xinference/client/restful/restful_client.py CHANGED
@@ -683,6 +683,19 @@ class Client:
  response_data = response.json()
  self._cluster_authed = bool(response_data["auth"])

+ def vllm_models(self) -> Dict[str, Any]:
+ url = f"{self.base_url}/v1/models/vllm-supported"
+ response = requests.get(url, headers=self._headers)
+ if response.status_code != 200:
+ raise RuntimeError(
+ f"Failed to fetch VLLM models. detail: {response.json()['detail']}"
+ )
+
+ try:
+ return response.json()
+ except Exception as e:
+ raise RuntimeError(f"Error parsing JSON response: {e}")
+
  def login(self, username: str, password: str):
  if not self._cluster_authed:
  return
@@ -778,6 +791,9 @@ class Client:
  replica: int = 1,
  n_gpu: Optional[Union[int, str]] = "auto",
  request_limits: Optional[int] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  **kwargs,
  ) -> str:
  """
@@ -805,6 +821,12 @@ class Client:
  request_limits: Optional[int]
  The number of request limits for this model, default is None.
  ``request_limits=None`` means no limits for this model.
+ peft_model_path: Optional[str]
+ PEFT (Parameter-Efficient Fine-Tuning) model path.
+ image_lora_load_kwargs: Optional[Dict]
+ lora load parameters for image model
+ image_lora_fuse_kwargs: Optional[Dict]
+ lora fuse parameters for image model
  **kwargs:
  Any other parameters been specified.

@@ -827,6 +849,9 @@ class Client:
  "replica": replica,
  "n_gpu": n_gpu,
  "request_limits": request_limits,
+ "peft_model_path": peft_model_path,
+ "image_lora_load_kwargs": image_lora_load_kwargs,
+ "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
  }

  for key, value in kwargs.items():
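
These hunks surface the new options on the Python client: launch_model() now accepts peft_model_path plus image LoRA load/fuse kwargs, and vllm_models() wraps the new endpoint. A hedged usage sketch; the endpoint, model name, and adapter path are placeholders, not values from the diff:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
print(client.vllm_models())  # {"chat": [...], "generate": [...]}

model_uid = client.launch_model(
    model_name="llama-2-chat",        # assumed built-in model name
    model_format="pytorch",
    peft_model_path="/path/to/lora",  # hypothetical PEFT adapter directory
)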
xinference/core/supervisor.py CHANGED
@@ -714,6 +714,9 @@ class SupervisorActor(xo.StatelessActor):
  request_limits: Optional[int] = None,
  wait_ready: bool = True,
  model_version: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  **kwargs,
  ) -> str:
  if model_uid is None:
@@ -751,6 +754,9 @@ class SupervisorActor(xo.StatelessActor):
  model_type=model_type,
  n_gpu=n_gpu,
  request_limits=request_limits,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_kwargs,
+ image_lora_fuse_kwargs=image_lora_fuse_kwargs,
  **kwargs,
  )
  self._replica_model_uid_to_worker[_replica_model_uid] = worker_ref
@@ -922,7 +928,11 @@ class SupervisorActor(xo.StatelessActor):
  workers = list(self._worker_address_to_worker.values())
  for worker in workers:
  ret.update(await worker.list_models())
- return {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+ running_model_info = {parse_replica_model_uid(k)[0]: v for k, v in ret.items()}
+ # add replica count
+ for k, v in running_model_info.items():
+ v["replica"] = self._model_uid_to_replica_info[k].replica
+ return running_model_info

  def is_local_deployment(self) -> bool:
  # TODO: temporary.
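
The supervisor's list_models() now attaches each running model's replica count to the returned info. An illustrative sketch of reading it through the REST client, assuming the field is passed through unchanged; the endpoint and values are made up:

from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed endpoint
running = client.list_models()
# e.g. {"my-model": {"model_name": "llama-2-chat", "replica": 2, ...}}
for uid, info in running.items():
    print(uid, info["replica"])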
xinference/core/worker.py CHANGED
@@ -491,6 +491,9 @@ class WorkerActor(xo.StatelessActor):
  quantization: Optional[str],
  model_type: str = "LLM",
  n_gpu: Optional[Union[int, str]] = "auto",
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  request_limits: Optional[int] = None,
  **kwargs,
  ):
@@ -516,6 +519,16 @@ class WorkerActor(xo.StatelessActor):
  if isinstance(n_gpu, str) and n_gpu != "auto":
  raise ValueError("Currently `n_gpu` only supports `auto`.")

+ if peft_model_path is not None:
+ if model_type in ("embedding", "rerank"):
+ raise ValueError(
+ f"PEFT adaptors cannot be applied to embedding or rerank models."
+ )
+ if model_type == "LLM" and model_format in ("ggufv2", "ggmlv3"):
+ raise ValueError(
+ f"PEFT adaptors can only be applied to pytorch-like models"
+ )
+
  assert model_uid not in self._model_uid_to_model
  self._check_model_is_valid(model_name, model_format)
  assert self._supervisor_ref is not None
@@ -537,6 +550,9 @@ class WorkerActor(xo.StatelessActor):
  model_format,
  model_size_in_billions,
  quantization,
+ peft_model_path,
+ image_lora_load_kwargs,
+ image_lora_fuse_kwargs,
  is_local_deployment,
  **kwargs,
  )
xinference/deploy/cmdline.py CHANGED
@@ -17,7 +17,7 @@ import logging
  import os
  import sys
  import warnings
- from typing import List, Optional, Union
+ from typing import List, Optional, Tuple, Union

  import click
  from xoscar.utils import get_next_port
@@ -596,6 +596,26 @@ def list_model_registrations(
  type=str,
  help='The number of GPUs used by the model, default is "auto".',
  )
+ @click.option(
+ "--peft-model-path",
+ default=None,
+ type=str,
+ help="PEFT model path.",
+ )
+ @click.option(
+ "--image-lora-load-kwargs",
+ "-ld",
+ "image_lora_load_kwargs",
+ type=(str, str),
+ multiple=True,
+ )
+ @click.option(
+ "--image-lora-fuse-kwargs",
+ "-fd",
+ "image_lora_fuse_kwargs",
+ type=(str, str),
+ multiple=True,
+ )
  @click.option(
  "--trust-remote-code",
  default=True,
@@ -614,6 +634,9 @@ def model_launch(
  quantization: str,
  replica: int,
  n_gpu: str,
+ peft_model_path: Optional[str],
+ image_lora_load_kwargs: Optional[Tuple],
+ image_lora_fuse_kwargs: Optional[Tuple],
  trust_remote_code: bool,
  ):
  kwargs = {}
@@ -630,6 +653,17 @@ def model_launch(
  else:
  _n_gpu = int(n_gpu)

+ image_lora_load_params = (
+ {k: handle_click_args_type(v) for k, v in dict(image_lora_load_kwargs).items()}
+ if image_lora_load_kwargs
+ else None
+ )
+ image_lora_fuse_params = (
+ {k: handle_click_args_type(v) for k, v in dict(image_lora_fuse_kwargs).items()}
+ if image_lora_fuse_kwargs
+ else None
+ )
+
  endpoint = get_endpoint(endpoint)
  model_size: Optional[Union[str, int]] = (
  size_in_billions
@@ -648,6 +682,9 @@ def model_launch(
  quantization=quantization,
  replica=replica,
  n_gpu=_n_gpu,
+ peft_model_path=peft_model_path,
+ image_lora_load_kwargs=image_lora_load_params,
+ image_lora_fuse_kwargs=image_lora_fuse_params,
  trust_remote_code=trust_remote_code,
  **kwargs,
  )
@@ -944,6 +981,21 @@ def model_chat(
  )


+ @cli.command("vllm-models", help="Query and display models compatible with VLLM.")
+ @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
+ def vllm_models(endpoint: Optional[str]):
+ endpoint = get_endpoint(endpoint)
+ client = RESTfulClient(base_url=endpoint)
+ client._set_token(get_stored_token(endpoint, client))
+ vllm_models_dict = client.vllm_models()
+ print("VLLM supported model families:")
+ chat_models = vllm_models_dict["chat"]
+ supported_models = vllm_models_dict["generate"]
+
+ print("VLLM supported chat model families:", chat_models)
+ print("VLLM supported generate model families:", supported_models)
+
+
  @cli.command("login", help="Login when the cluster is authenticated.")
  @click.option("--endpoint", "-e", type=str, help="Xinference endpoint.")
  @click.option("--username", type=str, required=True, help="Username.")
xinference/device_utils.py CHANGED
@@ -92,8 +92,6 @@ def gpu_count():
  )

  return min(torch.cuda.device_count(), len(cuda_visible_devices))
- elif torch.backends.mps.is_available():
- return 1
  elif is_xpu_available():
  return torch.xpu.device_count()
  else:
xinference/model/core.py CHANGED
@@ -13,7 +13,7 @@
  # limitations under the License.

  from abc import ABC, abstractmethod
- from typing import Any, List, Optional, Tuple
+ from typing import Any, Dict, List, Optional, Tuple

  from .._compat import BaseModel

@@ -52,6 +52,9 @@ def create_model_instance(
  model_format: Optional[str] = None,
  model_size_in_billions: Optional[int] = None,
  quantization: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
+ image_lora_load_kwargs: Optional[Dict] = None,
+ image_lora_fuse_kwargs: Optional[Dict] = None,
  is_local_deployment: bool = False,
  **kwargs,
  ) -> Tuple[Any, ModelDescription]:
@@ -70,6 +73,7 @@ def create_model_instance(
  model_format,
  model_size_in_billions,
  quantization,
+ peft_model_path,
  is_local_deployment,
  **kwargs,
  )
@@ -82,7 +86,14 @@
  elif model_type == "image":
  kwargs.pop("trust_remote_code", None)
  return create_image_model_instance(
- subpool_addr, devices, model_uid, model_name, **kwargs
+ subpool_addr,
+ devices,
+ model_uid,
+ model_name,
+ lora_model_path=peft_model_path,
+ lora_load_kwargs=image_lora_load_kwargs,
+ lora_fuse_kwargs=image_lora_fuse_kwargs,
+ **kwargs,
  )
  elif model_type == "rerank":
  kwargs.pop("trust_remote_code", None)
xinference/model/image/core.py CHANGED
@@ -155,7 +155,14 @@ def get_cache_status(


  def create_image_model_instance(
- subpool_addr: str, devices: List[str], model_uid: str, model_name: str, **kwargs
+ subpool_addr: str,
+ devices: List[str],
+ model_uid: str,
+ model_name: str,
+ lora_model_path: Optional[str] = None,
+ lora_load_kwargs: Optional[Dict] = None,
+ lora_fuse_kwargs: Optional[Dict] = None,
+ **kwargs,
  ) -> Tuple[DiffusionModel, ImageModelDescription]:
  model_spec = match_diffusion(model_name)
  controlnet = kwargs.get("controlnet")
@@ -187,7 +194,14 @@
  else:
  kwargs["controlnet"] = controlnet_model_paths
  model_path = cache(model_spec)
- model = DiffusionModel(model_uid, model_path, **kwargs)
+ model = DiffusionModel(
+ model_uid,
+ model_path,
+ lora_model_path=lora_model_path,
+ lora_load_kwargs=lora_load_kwargs,
+ lora_fuse_kwargs=lora_fuse_kwargs,
+ **kwargs,
+ )
  model_description = ImageModelDescription(
  subpool_addr, devices, model_spec, model_path=model_path
  )
xinference/model/image/stable_diffusion/core.py CHANGED
@@ -21,7 +21,7 @@ import uuid
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from io import BytesIO
- from typing import List, Optional, Union
+ from typing import Dict, List, Optional, Union

  from ....constants import XINFERENCE_IMAGE_DIR
  from ....device_utils import move_model_to_available_device
@@ -32,14 +32,36 @@ logger = logging.getLogger(__name__)

  class DiffusionModel:
  def __init__(
- self, model_uid: str, model_path: str, device: Optional[str] = None, **kwargs
+ self,
+ model_uid: str,
+ model_path: str,
+ device: Optional[str] = None,
+ lora_model_path: Optional[str] = None,
+ lora_load_kwargs: Optional[Dict] = None,
+ lora_fuse_kwargs: Optional[Dict] = None,
+ **kwargs,
  ):
  self._model_uid = model_uid
  self._model_path = model_path
  self._device = device
  self._model = None
+ self._lora_model_path = lora_model_path
+ self._lora_load_kwargs = lora_load_kwargs or {}
+ self._lora_fuse_kwargs = lora_fuse_kwargs or {}
  self._kwargs = kwargs

+ def _apply_lora(self):
+ if self._lora_model_path is not None:
+ logger.info(
+ f"Loading the LoRA with load kwargs: {self._lora_load_kwargs}, fuse kwargs: {self._lora_fuse_kwargs}."
+ )
+ assert self._model is not None
+ self._model.load_lora_weights(
+ self._lora_model_path, **self._lora_load_kwargs
+ )
+ self._model.fuse_lora(**self._lora_fuse_kwargs)
+ logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
+
  def load(self):
  # import torch
  from diffusers import AutoPipelineForText2Image
@@ -61,6 +83,7 @@ class DiffusionModel:
  self._model = move_model_to_available_device(self._model)
  # Recommended if your computer has < 64 GB of RAM
  self._model.enable_attention_slicing()
+ self._apply_lora()

  def _call_model(
  self,
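
DiffusionModel now optionally loads and fuses a LoRA right after the diffusers pipeline is created. A hedged sketch of the underlying diffusers calls that _apply_lora wraps; the model id, adapter path, and kwargs are placeholders, and running this for real requires the weights and suitable hardware:

from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo")  # placeholder model id
pipe.load_lora_weights("/path/to/lora", adapter_name="style")  # load kwargs (illustrative)
pipe.fuse_lora(lora_scale=0.8)                                 # fuse kwargs (illustrative)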
xinference/model/llm/__init__.py CHANGED
@@ -31,6 +31,7 @@ from .llm_family import (
  BUILTIN_LLM_PROMPT_STYLE,
  BUILTIN_MODELSCOPE_LLM_FAMILIES,
  LLM_CLASSES,
+ PEFT_SUPPORTED_CLASSES,
  CustomLLMFamilyV1,
  GgmlLLMSpecV1,
  LLMFamilyV1,
@@ -95,6 +96,22 @@ def _install():
  PytorchModel,
  ]
  )
+ PEFT_SUPPORTED_CLASSES.extend(
+ [
+ BaichuanPytorchChatModel,
+ VicunaPytorchChatModel,
+ FalconPytorchChatModel,
+ ChatglmPytorchChatModel,
+ LlamaPytorchModel,
+ LlamaPytorchChatModel,
+ PytorchChatModel,
+ FalconPytorchModel,
+ Internlm2PytorchChatModel,
+ QwenVLChatModel,
+ YiVLChatModel,
+ PytorchModel,
+ ]
+ )

  json_path = os.path.join(
  os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
xinference/model/llm/core.py CHANGED
@@ -180,6 +180,7 @@ def create_llm_model_instance(
  model_format: Optional[str] = None,
  model_size_in_billions: Optional[int] = None,
  quantization: Optional[str] = None,
+ peft_model_path: Optional[str] = None,
  is_local_deployment: bool = False,
  **kwargs,
  ) -> Tuple[LLM, LLMDescription]:
@@ -203,7 +204,9 @@ def create_llm_model_instance(
  assert quantization is not None
  save_path = cache(llm_family, llm_spec, quantization)

- llm_cls = match_llm_cls(llm_family, llm_spec, quantization)
+ llm_cls = match_llm_cls(
+ llm_family, llm_spec, quantization, peft_model_path=peft_model_path
+ )
  if not llm_cls:
  raise ValueError(
  f"Model not supported, name: {model_name}, format: {model_format},"
@@ -211,7 +214,20 @@ def create_llm_model_instance(
  )
  logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")

- model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs)
+ if peft_model_path is not None:
+ model = llm_cls(
+ model_uid,
+ llm_family,
+ llm_spec,
+ quantization,
+ save_path,
+ kwargs,
+ peft_model_path,
+ )
+ else:
+ model = llm_cls(
+ model_uid, llm_family, llm_spec, quantization, save_path, kwargs
+ )
  return model, LLMDescription(
  subpool_addr, devices, llm_family, llm_spec, quantization
  )
xinference/model/llm/ggml/llamacpp.py CHANGED
@@ -35,15 +35,6 @@ from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
  logger = logging.getLogger(__name__)


- SIZE_TO_GPU_LAYERS = {
- 3: 26,
- 7: 32,
- 13: 40,
- 30: 60,
- 65: 80,
- }
-
-
  class LlamaCppModel(LLM):
  def __init__(
  self,
@@ -56,13 +47,6 @@ class LlamaCppModel(LLM):
  ):
  super().__init__(model_uid, model_family, model_spec, quantization, model_path)

- closest_size = min(
- SIZE_TO_GPU_LAYERS.keys(),
- key=lambda x: abs(
- x - self.handle_model_size(model_spec.model_size_in_billions)
- ),
- )
- self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
  self._llamacpp_model_config: LlamaCppModelConfig = self._sanitize_model_config(
  llamacpp_model_config
  )
@@ -96,9 +80,9 @@ class LlamaCppModel(LLM):

  if self._is_darwin_and_apple_silicon() and self._can_apply_metal():
  # TODO: platform.processor() is not safe, need to be replaced to other method.
- llamacpp_model_config.setdefault("n_gpu_layers", 1)
+ llamacpp_model_config.setdefault("n_gpu_layers", -1)
  elif self._is_linux() and self._can_apply_cublas():
- llamacpp_model_config.setdefault("n_gpu_layers", self._gpu_layers)
+ llamacpp_model_config.setdefault("n_gpu_layers", -1)

  return llamacpp_model_config

@@ -313,7 +297,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
  generate_config["stop"] = [stop, "Observation:"]
  elif isinstance(stop, Iterable):
  assert not isinstance(stop, str)
- generate_config["stop"] = stop + ["Observation:"]
+ generate_config["stop"] = stop + ["Observation:"] # type: ignore
  else:
  generate_config["stop"] = "Observation:"

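The fixed SIZE_TO_GPU_LAYERS table is gone: llama-cpp-python treats n_gpu_layers=-1 as "offload every layer", so both the Metal and cuBLAS paths now default to full offload. A minimal sketch of the equivalent direct call; the model path is a placeholder:

from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf", n_gpu_layers=-1)  # offload all layers to GPU
out = llm("Q: What is 2 + 2? A:", max_tokens=8)
print(out["choices"][0]["text"])
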
xinference/model/llm/llm_family.json CHANGED
@@ -1599,10 +1599,15 @@
  "model_size_in_billions": 72,
  "quantizations": [
  "q2_k",
- "q3_k_m"
+ "q3_k_m",
+ "q4_k_m"
  ],
  "model_id": "Qwen/Qwen1.5-72B-Chat-GGUF",
- "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+ "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+ "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+ "quantization_parts": {
+ "q4_k_m": ["a", "b"]
+ }
  }
  ],
  "prompt_style": {
@@ -2967,7 +2972,7 @@
  },
  {
  "version": 1,
- "context_length": 100000,
+ "context_length": 16384,
  "model_name": "glaive-coder",
  "model_description": "A code model trained on a dataset of ~140k programming related problems and solutions generated from Glaive’s synthetic data generation platform.",
  "model_lang": [