xinference-0.10.0-py3-none-any.whl → xinference-0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (97)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +34 -15
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +40 -18
  5. xinference/core/supervisor.py +48 -9
  6. xinference/core/worker.py +13 -8
  7. xinference/deploy/cmdline.py +22 -9
  8. xinference/model/audio/__init__.py +40 -1
  9. xinference/model/audio/core.py +25 -45
  10. xinference/model/audio/custom.py +148 -0
  11. xinference/model/core.py +6 -9
  12. xinference/model/embedding/core.py +1 -2
  13. xinference/model/embedding/model_spec.json +24 -0
  14. xinference/model/embedding/model_spec_modelscope.json +24 -0
  15. xinference/model/image/core.py +12 -4
  16. xinference/model/image/stable_diffusion/core.py +8 -7
  17. xinference/model/llm/__init__.py +0 -6
  18. xinference/model/llm/core.py +9 -14
  19. xinference/model/llm/ggml/llamacpp.py +2 -10
  20. xinference/model/llm/llm_family.json +507 -7
  21. xinference/model/llm/llm_family.py +41 -4
  22. xinference/model/llm/llm_family_modelscope.json +260 -0
  23. xinference/model/llm/pytorch/baichuan.py +4 -3
  24. xinference/model/llm/pytorch/chatglm.py +5 -2
  25. xinference/model/llm/pytorch/core.py +37 -41
  26. xinference/model/llm/pytorch/falcon.py +6 -5
  27. xinference/model/llm/pytorch/internlm2.py +5 -2
  28. xinference/model/llm/pytorch/llama_2.py +6 -5
  29. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  30. xinference/model/llm/pytorch/vicuna.py +4 -3
  31. xinference/model/llm/pytorch/yi_vl.py +4 -2
  32. xinference/model/llm/utils.py +42 -4
  33. xinference/model/llm/vllm/core.py +54 -6
  34. xinference/model/rerank/core.py +26 -12
  35. xinference/model/rerank/model_spec.json +24 -0
  36. xinference/model/rerank/model_spec_modelscope.json +25 -1
  37. xinference/model/utils.py +12 -1
  38. xinference/thirdparty/omnilmm/chat.py +1 -1
  39. xinference/types.py +70 -19
  40. xinference/utils.py +1 -0
  41. xinference/web/ui/build/asset-manifest.json +3 -3
  42. xinference/web/ui/build/index.html +1 -1
  43. xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
  44. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
  65. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/METADATA +13 -10
  66. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/RECORD +71 -74
  67. xinference/model/llm/ggml/ctransformers.py +0 -281
  68. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  69. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  70. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  86. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  87. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
  94. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
  95. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
  96. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
  97. {xinference-0.10.0.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2024-03-29T12:46:14+0800",
11
+ "date": "2024-04-19T11:39:12+0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "2857ec497afbd2a6895d3658384ff3b4022b2840",
15
- "version": "0.10.0"
14
+ "full-revisionid": "f19e85be09bce966e0c0b3e01bc5690eb6016398",
15
+ "version": "0.10.2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
xinference/api/restful_api.py CHANGED
@@ -64,6 +64,7 @@ from ..types import (
64
64
  CreateChatCompletion,
65
65
  CreateCompletion,
66
66
  ImageList,
67
+ PeftModelConfig,
67
68
  max_tokens_field,
68
69
  )
69
70
  from .oauth2.auth_service import AuthService
@@ -692,9 +693,7 @@ class RESTfulAPI:
692
693
  replica = payload.get("replica", 1)
693
694
  n_gpu = payload.get("n_gpu", "auto")
694
695
  request_limits = payload.get("request_limits", None)
695
- peft_model_path = payload.get("peft_model_path", None)
696
- image_lora_load_kwargs = payload.get("image_lora_load_kwargs", None)
697
- image_lora_fuse_kwargs = payload.get("image_lora_fuse_kwargs", None)
696
+ peft_model_config = payload.get("peft_model_config", None)
698
697
  worker_ip = payload.get("worker_ip", None)
699
698
  gpu_idx = payload.get("gpu_idx", None)
700
699
 
@@ -708,9 +707,7 @@ class RESTfulAPI:
708
707
  "replica",
709
708
  "n_gpu",
710
709
  "request_limits",
711
- "peft_model_path",
712
- "image_lora_load_kwargs",
713
- "image_lora_fuse_kwargs",
710
+ "peft_model_config",
714
711
  "worker_ip",
715
712
  "gpu_idx",
716
713
  }
@@ -725,6 +722,11 @@ class RESTfulAPI:
725
722
  detail="Invalid input. Please specify the model name",
726
723
  )
727
724
 
725
+ if peft_model_config is not None:
726
+ peft_model_config = PeftModelConfig.from_dict(peft_model_config)
727
+ else:
728
+ peft_model_config = None
729
+
728
730
  try:
729
731
  model_uid = await (await self._get_supervisor_ref()).launch_builtin_model(
730
732
  model_uid=model_uid,
@@ -737,9 +739,7 @@ class RESTfulAPI:
737
739
  n_gpu=n_gpu,
738
740
  request_limits=request_limits,
739
741
  wait_ready=wait_ready,
740
- peft_model_path=peft_model_path,
741
- image_lora_load_kwargs=image_lora_load_kwargs,
742
- image_lora_fuse_kwargs=image_lora_fuse_kwargs,
742
+ peft_model_config=peft_model_config,
743
743
  worker_ip=worker_ip,
744
744
  gpu_idx=gpu_idx,
745
745
  **kwargs,
@@ -1007,8 +1007,16 @@ class RESTfulAPI:
1007
1007
  raise HTTPException(status_code=500, detail=str(e))
1008
1008
 
1009
1009
  async def create_embedding(self, request: Request) -> Response:
1010
- body = CreateEmbeddingRequest.parse_obj(await request.json())
1010
+ payload = await request.json()
1011
+ body = CreateEmbeddingRequest.parse_obj(payload)
1011
1012
  model_uid = body.model
1013
+ exclude = {
1014
+ "model",
1015
+ "input",
1016
+ "user",
1017
+ "encoding_format",
1018
+ }
1019
+ kwargs = {key: value for key, value in payload.items() if key not in exclude}
1012
1020
 
1013
1021
  try:
1014
1022
  model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1022,7 +1030,7 @@ class RESTfulAPI:
1022
1030
  raise HTTPException(status_code=500, detail=str(e))
1023
1031
 
1024
1032
  try:
1025
- embedding = await model.create_embedding(body.input)
1033
+ embedding = await model.create_embedding(body.input, **kwargs)
1026
1034
  return Response(embedding, media_type="application/json")
1027
1035
  except RuntimeError as re:
1028
1036
  logger.error(re, exc_info=True)
@@ -1035,8 +1043,15 @@ class RESTfulAPI:
1035
1043
  raise HTTPException(status_code=500, detail=str(e))
1036
1044
 
1037
1045
  async def rerank(self, request: Request) -> Response:
1038
- body = RerankRequest.parse_obj(await request.json())
1046
+ payload = await request.json()
1047
+ body = RerankRequest.parse_obj(payload)
1039
1048
  model_uid = body.model
1049
+ kwargs = {
1050
+ key: value
1051
+ for key, value in payload.items()
1052
+ if key not in RerankRequest.__annotations__.keys()
1053
+ }
1054
+
1040
1055
  try:
1041
1056
  model = await (await self._get_supervisor_ref()).get_model(model_uid)
1042
1057
  except ValueError as ve:
@@ -1055,6 +1070,7 @@ class RESTfulAPI:
1055
1070
  top_n=body.top_n,
1056
1071
  max_chunks_per_doc=body.max_chunks_per_doc,
1057
1072
  return_documents=body.return_documents,
1073
+ **kwargs,
1058
1074
  )
1059
1075
  return Response(scores, media_type="application/json")
1060
1076
  except RuntimeError as re:
@@ -1345,9 +1361,12 @@ class RESTfulAPI:
1345
1361
  detail=f"Only {function_call_models} support tool messages",
1346
1362
  )
1347
1363
  if body.tools and body.stream:
1348
- raise HTTPException(
1349
- status_code=400, detail="Tool calls does not support stream"
1350
- )
1364
+ is_vllm = await model.is_vllm_backend()
1365
+ if not is_vllm or model_family not in ["qwen-chat", "qwen1.5-chat"]:
1366
+ raise HTTPException(
1367
+ status_code=400,
1368
+ detail="Streaming support for tool calls is available only when using vLLM backend and Qwen models.",
1369
+ )
1351
1370
 
1352
1371
  if body.stream:
1353
1372
 
xinference/client/oscar/actor_client.py CHANGED
@@ -111,7 +111,7 @@ class ClientIteratorWrapper(AsyncIterator):
111
111
 
112
112
 
113
113
  class EmbeddingModelHandle(ModelHandle):
114
- def create_embedding(self, input: Union[str, List[str]]) -> bytes:
114
+ def create_embedding(self, input: Union[str, List[str]], **kwargs) -> bytes:
115
115
  """
116
116
  Creates an embedding vector representing the input text.
117
117
 
@@ -128,7 +128,7 @@ class EmbeddingModelHandle(ModelHandle):
128
128
  machine learning models and algorithms.
129
129
  """
130
130
 
131
- coro = self._model_ref.create_embedding(input)
131
+ coro = self._model_ref.create_embedding(input, **kwargs)
132
132
  return orjson.loads(self._isolation.call(coro))
133
133
 
134
134
 
@@ -140,6 +140,7 @@ class RerankModelHandle(ModelHandle):
140
140
  top_n: Optional[int],
141
141
  max_chunks_per_doc: Optional[int],
142
142
  return_documents: Optional[bool],
143
+ **kwargs,
143
144
  ):
144
145
  """
145
146
  Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -163,7 +164,7 @@ class RerankModelHandle(ModelHandle):
163
164
 
164
165
  """
165
166
  coro = self._model_ref.rerank(
166
- documents, query, top_n, max_chunks_per_doc, return_documents
167
+ documents, query, top_n, max_chunks_per_doc, return_documents, **kwargs
167
168
  )
168
169
  results = orjson.loads(self._isolation.call(coro))
169
170
  for r in results["results"]:
xinference/client/restful/restful_client.py CHANGED
@@ -18,6 +18,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
18
18
 
19
19
  import requests
20
20
 
21
+ from ...model.utils import convert_float_to_int_or_str
22
+ from ...types import LoRA, PeftModelConfig
21
23
  from ..common import streaming_response_iterator
22
24
 
23
25
  if TYPE_CHECKING:
@@ -80,7 +82,7 @@ class RESTfulModelHandle:
80
82
 
81
83
 
82
84
  class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
83
- def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
85
+ def create_embedding(self, input: Union[str, List[str]], **kwargs) -> "Embedding":
84
86
  """
85
87
  Create an Embedding from user input via RESTful APIs.
86
88
 
@@ -102,7 +104,11 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
102
104
 
103
105
  """
104
106
  url = f"{self._base_url}/v1/embeddings"
105
- request_body = {"model": self._model_uid, "input": input}
107
+ request_body = {
108
+ "model": self._model_uid,
109
+ "input": input,
110
+ }
111
+ request_body.update(kwargs)
106
112
  response = requests.post(url, json=request_body, headers=self.auth_headers)
107
113
  if response.status_code != 200:
108
114
  raise RuntimeError(
@@ -121,6 +127,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
121
127
  top_n: Optional[int] = None,
122
128
  max_chunks_per_doc: Optional[int] = None,
123
129
  return_documents: Optional[bool] = None,
130
+ **kwargs,
124
131
  ):
125
132
  """
126
133
  Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -156,6 +163,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
156
163
  "max_chunks_per_doc": max_chunks_per_doc,
157
164
  "return_documents": return_documents,
158
165
  }
166
+ request_body.update(kwargs)
159
167
  response = requests.post(url, json=request_body, headers=self.auth_headers)
160
168
  if response.status_code != 200:
161
169
  raise RuntimeError(
@@ -740,7 +748,7 @@ class Client:
740
748
  def launch_speculative_llm(
741
749
  self,
742
750
  model_name: str,
743
- model_size_in_billions: Optional[int],
751
+ model_size_in_billions: Optional[Union[int, str, float]],
744
752
  quantization: Optional[str],
745
753
  draft_model_name: str,
746
754
  draft_model_size_in_billions: Optional[int],
@@ -761,6 +769,10 @@ class Client:
761
769
  "`launch_speculative_llm` is an experimental feature and the API may change in the future."
762
770
  )
763
771
 
772
+ # convert float to int or string since the RESTful API does not accept float.
773
+ if isinstance(model_size_in_billions, float):
774
+ model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
775
+
764
776
  payload = {
765
777
  "model_uid": None,
766
778
  "model_name": model_name,
@@ -788,15 +800,13 @@ class Client:
788
800
  model_name: str,
789
801
  model_type: str = "LLM",
790
802
  model_uid: Optional[str] = None,
791
- model_size_in_billions: Optional[Union[int, str]] = None,
803
+ model_size_in_billions: Optional[Union[int, str, float]] = None,
792
804
  model_format: Optional[str] = None,
793
805
  quantization: Optional[str] = None,
794
806
  replica: int = 1,
795
807
  n_gpu: Optional[Union[int, str]] = "auto",
808
+ peft_model_config: Optional[Dict] = None,
796
809
  request_limits: Optional[int] = None,
797
- peft_model_path: Optional[str] = None,
798
- image_lora_load_kwargs: Optional[Dict] = None,
799
- image_lora_fuse_kwargs: Optional[Dict] = None,
800
810
  worker_ip: Optional[str] = None,
801
811
  gpu_idx: Optional[Union[int, List[int]]] = None,
802
812
  **kwargs,
@@ -812,7 +822,7 @@ class Client:
812
822
  type of model.
813
823
  model_uid: str
814
824
  UID of model, auto generate a UUID if is None.
815
- model_size_in_billions: Optional[int]
825
+ model_size_in_billions: Optional[Union[int, str, float]]
816
826
  The size (in billions) of the model.
817
827
  model_format: Optional[str]
818
828
  The format of the model.
@@ -823,15 +833,13 @@ class Client:
823
833
  n_gpu: Optional[Union[int, str]],
824
834
  The number of GPUs used by the model, default is "auto".
825
835
  ``n_gpu=None`` means cpu only, ``n_gpu=auto`` lets the system automatically determine the best number of GPUs to use.
836
+ peft_model_config: Optional[Dict]
837
+ - "lora_list": A List of PEFT (Parameter-Efficient Fine-Tuning) model and path.
838
+ - "image_lora_load_kwargs": A Dict of lora load parameters for image model
839
+ - "image_lora_fuse_kwargs": A Dict of lora fuse parameters for image model
826
840
  request_limits: Optional[int]
827
- The number of request limits for this model default is None.
841
+ The number of request limits for this model, default is None.
828
842
  ``request_limits=None`` means no limits for this model.
829
- peft_model_path: Optional[str]
830
- PEFT (Parameter-Efficient Fine-Tuning) model path.
831
- image_lora_load_kwargs: Optional[Dict]
832
- lora load parameters for image model
833
- image_lora_fuse_kwargs: Optional[Dict]
834
- lora fuse parameters for image model
835
843
  worker_ip: Optional[str]
836
844
  Specify the worker ip where the model is located in a distributed scenario.
837
845
  gpu_idx: Optional[Union[int, List[int]]]
@@ -848,9 +856,26 @@ class Client:
848
856
 
849
857
  url = f"{self.base_url}/v1/models"
850
858
 
859
+ if peft_model_config is not None:
860
+ lora_list = [
861
+ LoRA.from_dict(model) for model in peft_model_config["lora_list"]
862
+ ]
863
+ peft_model = PeftModelConfig(
864
+ lora_list,
865
+ peft_model_config["image_lora_load_kwargs"],
866
+ peft_model_config["image_lora_fuse_kwargs"],
867
+ )
868
+ else:
869
+ peft_model = None
870
+
871
+ # convert float to int or string since the RESTful API does not accept float.
872
+ if isinstance(model_size_in_billions, float):
873
+ model_size_in_billions = convert_float_to_int_or_str(model_size_in_billions)
874
+
851
875
  payload = {
852
876
  "model_uid": model_uid,
853
877
  "model_name": model_name,
878
+ "peft_model_config": peft_model.to_dict() if peft_model else None,
854
879
  "model_type": model_type,
855
880
  "model_size_in_billions": model_size_in_billions,
856
881
  "model_format": model_format,
@@ -858,9 +883,6 @@ class Client:
858
883
  "replica": replica,
859
884
  "n_gpu": n_gpu,
860
885
  "request_limits": request_limits,
861
- "peft_model_path": peft_model_path,
862
- "image_lora_load_kwargs": image_lora_load_kwargs,
863
- "image_lora_fuse_kwargs": image_lora_fuse_kwargs,
864
886
  "worker_ip": worker_ip,
865
887
  "gpu_idx": gpu_idx,
866
888
  }
xinference/core/supervisor.py CHANGED
@@ -30,6 +30,7 @@ from ..constants import (
30
30
  )
31
31
  from ..core import ModelActor
32
32
  from ..core.status_guard import InstanceInfo, LaunchStatus
33
+ from ..types import PeftModelConfig
33
34
  from .metrics import record_metrics
34
35
  from .resource import GPUStatus, ResourceStatus
35
36
  from .utils import (
@@ -135,6 +136,13 @@ class SupervisorActor(xo.StatelessActor):
135
136
  EventCollectorActor, address=self.address, uid=EventCollectorActor.uid()
136
137
  )
137
138
 
139
+ from ..model.audio import (
140
+ CustomAudioModelFamilyV1,
141
+ generate_audio_description,
142
+ get_audio_model_descriptions,
143
+ register_audio,
144
+ unregister_audio,
145
+ )
138
146
  from ..model.embedding import (
139
147
  CustomEmbeddingModelSpec,
140
148
  generate_embedding_description,
@@ -177,6 +185,12 @@ class SupervisorActor(xo.StatelessActor):
177
185
  unregister_rerank,
178
186
  generate_rerank_description,
179
187
  ),
188
+ "audio": (
189
+ CustomAudioModelFamilyV1,
190
+ register_audio,
191
+ unregister_audio,
192
+ generate_audio_description,
193
+ ),
180
194
  }
181
195
 
182
196
  # record model version
@@ -185,6 +199,7 @@ class SupervisorActor(xo.StatelessActor):
185
199
  model_version_infos.update(get_embedding_model_descriptions())
186
200
  model_version_infos.update(get_rerank_model_descriptions())
187
201
  model_version_infos.update(get_image_model_descriptions())
202
+ model_version_infos.update(get_audio_model_descriptions())
188
203
  await self._cache_tracker_ref.record_model_version(
189
204
  model_version_infos, self.address
190
205
  )
@@ -483,6 +498,7 @@ class SupervisorActor(xo.StatelessActor):
483
498
  return ret
484
499
  elif model_type == "audio":
485
500
  from ..model.audio import BUILTIN_AUDIO_MODELS
501
+ from ..model.audio.custom import get_user_defined_audios
486
502
 
487
503
  ret = []
488
504
  for model_name, family in BUILTIN_AUDIO_MODELS.items():
@@ -491,6 +507,16 @@ class SupervisorActor(xo.StatelessActor):
491
507
  else:
492
508
  ret.append({"model_name": model_name, "is_builtin": True})
493
509
 
510
+ for model_spec in get_user_defined_audios():
511
+ if detailed:
512
+ ret.append(
513
+ await self._to_audio_model_reg(model_spec, is_builtin=False)
514
+ )
515
+ else:
516
+ ret.append(
517
+ {"model_name": model_spec.model_name, "is_builtin": False}
518
+ )
519
+
494
520
  ret.sort(key=sort_helper)
495
521
  return ret
496
522
  elif model_type == "rerank":
@@ -548,8 +574,9 @@ class SupervisorActor(xo.StatelessActor):
548
574
  raise ValueError(f"Model {model_name} not found")
549
575
  elif model_type == "audio":
550
576
  from ..model.audio import BUILTIN_AUDIO_MODELS
577
+ from ..model.audio.custom import get_user_defined_audios
551
578
 
552
- for f in BUILTIN_AUDIO_MODELS.values():
579
+ for f in list(BUILTIN_AUDIO_MODELS.values()) + get_user_defined_audios():
553
580
  if f.model_name == model_name:
554
581
  return f
555
582
  raise ValueError(f"Model {model_name} not found")
@@ -654,7 +681,7 @@ class SupervisorActor(xo.StatelessActor):
654
681
  self,
655
682
  model_uid: Optional[str],
656
683
  model_name: str,
657
- model_size_in_billions: Optional[int],
684
+ model_size_in_billions: Optional[Union[int, str]],
658
685
  quantization: Optional[str],
659
686
  draft_model_name: str,
660
687
  draft_model_size_in_billions: Optional[int],
@@ -714,7 +741,7 @@ class SupervisorActor(xo.StatelessActor):
714
741
  self,
715
742
  model_uid: Optional[str],
716
743
  model_name: str,
717
- model_size_in_billions: Optional[int],
744
+ model_size_in_billions: Optional[Union[int, str]],
718
745
  model_format: Optional[str],
719
746
  quantization: Optional[str],
720
747
  model_type: Optional[str],
@@ -723,9 +750,7 @@ class SupervisorActor(xo.StatelessActor):
723
750
  request_limits: Optional[int] = None,
724
751
  wait_ready: bool = True,
725
752
  model_version: Optional[str] = None,
726
- peft_model_path: Optional[str] = None,
727
- image_lora_load_kwargs: Optional[Dict] = None,
728
- image_lora_fuse_kwargs: Optional[Dict] = None,
753
+ peft_model_config: Optional[PeftModelConfig] = None,
729
754
  worker_ip: Optional[str] = None,
730
755
  gpu_idx: Optional[Union[int, List[int]]] = None,
731
756
  **kwargs,
@@ -777,9 +802,7 @@ class SupervisorActor(xo.StatelessActor):
777
802
  model_type=model_type,
778
803
  n_gpu=n_gpu,
779
804
  request_limits=request_limits,
780
- peft_model_path=peft_model_path,
781
- image_lora_load_kwargs=image_lora_load_kwargs,
782
- image_lora_fuse_kwargs=image_lora_fuse_kwargs,
805
+ peft_model_config=peft_model_config,
783
806
  gpu_idx=gpu_idx,
784
807
  **kwargs,
785
808
  )
@@ -870,6 +893,12 @@ class SupervisorActor(xo.StatelessActor):
870
893
  address,
871
894
  dead_models,
872
895
  )
896
+ for replica_model_uid in dead_models:
897
+ model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
898
+ self._model_uid_to_replica_info.pop(model_uid, None)
899
+ self._replica_model_uid_to_worker.pop(
900
+ replica_model_uid, None
901
+ )
873
902
  dead_nodes.append(address)
874
903
  elif (
875
904
  status.failure_remaining_count
@@ -979,6 +1008,16 @@ class SupervisorActor(xo.StatelessActor):
979
1008
 
980
1009
  @log_async(logger=logger)
981
1010
  async def remove_worker(self, worker_address: str):
1011
+ uids_to_remove = []
1012
+ for model_uid in self._replica_model_uid_to_worker:
1013
+ if self._replica_model_uid_to_worker[model_uid].address == worker_address:
1014
+ uids_to_remove.append(model_uid)
1015
+
1016
+ for replica_model_uid in uids_to_remove:
1017
+ model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
1018
+ self._model_uid_to_replica_info.pop(model_uid, None)
1019
+ self._replica_model_uid_to_worker.pop(replica_model_uid, None)
1020
+
982
1021
  if worker_address in self._worker_address_to_worker:
983
1022
  del self._worker_address_to_worker[worker_address]
984
1023
  logger.debug("Worker %s has been removed successfully", worker_address)
xinference/core/worker.py CHANGED
@@ -36,6 +36,7 @@ from ..core import ModelActor
36
36
  from ..core.status_guard import LaunchStatus
37
37
  from ..device_utils import gpu_count
38
38
  from ..model.core import ModelDescription, create_model_instance
39
+ from ..types import PeftModelConfig
39
40
  from .event import Event, EventCollectorActor, EventType
40
41
  from .metrics import launch_metrics_export_server, record_metrics
41
42
  from .resource import gather_node_info
@@ -195,6 +196,12 @@ class WorkerActor(xo.StatelessActor):
195
196
  logger.info("Purge cache directory: %s", XINFERENCE_CACHE_DIR)
196
197
  purge_dir(XINFERENCE_CACHE_DIR)
197
198
 
199
+ from ..model.audio import (
200
+ CustomAudioModelFamilyV1,
201
+ get_audio_model_descriptions,
202
+ register_audio,
203
+ unregister_audio,
204
+ )
198
205
  from ..model.embedding import (
199
206
  CustomEmbeddingModelSpec,
200
207
  get_embedding_model_descriptions,
@@ -223,6 +230,7 @@ class WorkerActor(xo.StatelessActor):
223
230
  unregister_embedding,
224
231
  ),
225
232
  "rerank": (CustomRerankModelSpec, register_rerank, unregister_rerank),
233
+ "audio": (CustomAudioModelFamilyV1, register_audio, unregister_audio),
226
234
  }
227
235
 
228
236
  # record model version
@@ -231,6 +239,7 @@ class WorkerActor(xo.StatelessActor):
231
239
  model_version_infos.update(get_embedding_model_descriptions())
232
240
  model_version_infos.update(get_rerank_model_descriptions())
233
241
  model_version_infos.update(get_image_model_descriptions())
242
+ model_version_infos.update(get_audio_model_descriptions())
234
243
  await self._cache_tracker_ref.record_model_version(
235
244
  model_version_infos, self.address
236
245
  )
@@ -593,14 +602,12 @@ class WorkerActor(xo.StatelessActor):
593
602
  self,
594
603
  model_uid: str,
595
604
  model_name: str,
596
- model_size_in_billions: Optional[int],
605
+ model_size_in_billions: Optional[Union[int, str]],
597
606
  model_format: Optional[str],
598
607
  quantization: Optional[str],
599
608
  model_type: str = "LLM",
600
609
  n_gpu: Optional[Union[int, str]] = "auto",
601
- peft_model_path: Optional[str] = None,
602
- image_lora_load_kwargs: Optional[Dict] = None,
603
- image_lora_fuse_kwargs: Optional[Dict] = None,
610
+ peft_model_config: Optional[PeftModelConfig] = None,
604
611
  request_limits: Optional[int] = None,
605
612
  gpu_idx: Optional[Union[int, List[int]]] = None,
606
613
  **kwargs,
@@ -638,7 +645,7 @@ class WorkerActor(xo.StatelessActor):
638
645
  if isinstance(n_gpu, str) and n_gpu != "auto":
639
646
  raise ValueError("Currently `n_gpu` only supports `auto`.")
640
647
 
641
- if peft_model_path is not None:
648
+ if peft_model_config is not None:
642
649
  if model_type in ("embedding", "rerank"):
643
650
  raise ValueError(
644
651
  f"PEFT adaptors cannot be applied to embedding or rerank models."
@@ -669,9 +676,7 @@ class WorkerActor(xo.StatelessActor):
669
676
  model_format,
670
677
  model_size_in_billions,
671
678
  quantization,
672
- peft_model_path,
673
- image_lora_load_kwargs,
674
- image_lora_fuse_kwargs,
679
+ peft_model_config,
675
680
  is_local_deployment,
676
681
  **kwargs,
677
682
  )
xinference/deploy/cmdline.py CHANGED
@@ -640,10 +640,11 @@ def list_model_registrations(
640
640
  help='The number of GPUs used by the model, default is "auto".',
641
641
  )
642
642
  @click.option(
643
- "--peft-model-path",
644
- default=None,
645
- type=str,
646
- help="PEFT model path.",
643
+ "--lora-modules",
644
+ "-lm",
645
+ multiple=True,
646
+ type=(str, str),
647
+ help="LoRA module configurations in the format name=path. Multiple modules can be specified.",
647
648
  )
648
649
  @click.option(
649
650
  "--image-lora-load-kwargs",
@@ -696,7 +697,7 @@ def model_launch(
696
697
  quantization: str,
697
698
  replica: int,
698
699
  n_gpu: str,
699
- peft_model_path: Optional[str],
700
+ lora_modules: Optional[Tuple],
700
701
  image_lora_load_kwargs: Optional[Tuple],
701
702
  image_lora_fuse_kwargs: Optional[Tuple],
702
703
  worker_ip: Optional[str],
@@ -729,6 +730,18 @@ def model_launch(
729
730
  else None
730
731
  )
731
732
 
733
+ lora_list = (
734
+ [{"lora_name": k, "local_path": v} for k, v in dict(lora_modules).items()]
735
+ if lora_modules
736
+ else []
737
+ )
738
+
739
+ peft_model_config = {
740
+ "image_lora_load_kwargs": image_lora_load_params,
741
+ "image_lora_fuse_kwargs": image_lora_fuse_params,
742
+ "lora_list": lora_list,
743
+ }
744
+
732
745
  _gpu_idx: Optional[List[int]] = (
733
746
  None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
734
747
  )
@@ -736,7 +749,9 @@ def model_launch(
736
749
  endpoint = get_endpoint(endpoint)
737
750
  model_size: Optional[Union[str, int]] = (
738
751
  size_in_billions
739
- if size_in_billions is None or "_" in size_in_billions
752
+ if size_in_billions is None
753
+ or "_" in size_in_billions
754
+ or "." in size_in_billions
740
755
  else int(size_in_billions)
741
756
  )
742
757
  client = RESTfulClient(base_url=endpoint, api_key=api_key)
@@ -752,9 +767,7 @@ def model_launch(
752
767
  quantization=quantization,
753
768
  replica=replica,
754
769
  n_gpu=_n_gpu,
755
- peft_model_path=peft_model_path,
756
- image_lora_load_kwargs=image_lora_load_params,
757
- image_lora_fuse_kwargs=image_lora_fuse_params,
770
+ peft_model_config=peft_model_config,
758
771
  worker_ip=worker_ip,
759
772
  gpu_idx=_gpu_idx,
760
773
  trust_remote_code=trust_remote_code,
xinference/model/audio/__init__.py CHANGED
@@ -16,12 +16,51 @@ import codecs
16
16
  import json
17
17
  import os
18
18
 
19
- from .core import AudioModelFamilyV1, generate_audio_description, get_cache_status
19
+ from .core import (
20
+ AUDIO_MODEL_DESCRIPTIONS,
21
+ MODEL_NAME_TO_REVISION,
22
+ AudioModelFamilyV1,
23
+ generate_audio_description,
24
+ get_audio_model_descriptions,
25
+ get_cache_status,
26
+ )
27
+ from .custom import (
28
+ CustomAudioModelFamilyV1,
29
+ get_user_defined_audios,
30
+ register_audio,
31
+ unregister_audio,
32
+ )
20
33
 
21
34
  _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
22
35
  BUILTIN_AUDIO_MODELS = dict(
23
36
  (spec["model_name"], AudioModelFamilyV1(**spec))
24
37
  for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
25
38
  )
39
+ for model_name, model_spec in BUILTIN_AUDIO_MODELS.items():
40
+ MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
41
+
42
+ # register model description after recording model revision
43
+ for model_spec_info in [BUILTIN_AUDIO_MODELS]:
44
+ for model_name, model_spec in model_spec_info.items():
45
+ if model_spec.model_name not in AUDIO_MODEL_DESCRIPTIONS:
46
+ AUDIO_MODEL_DESCRIPTIONS.update(generate_audio_description(model_spec))
47
+
48
+ from ...constants import XINFERENCE_MODEL_DIR
49
+
50
+ # if persist=True, load them when init
51
+ user_defined_audio_dir = os.path.join(XINFERENCE_MODEL_DIR, "audio")
52
+ if os.path.isdir(user_defined_audio_dir):
53
+ for f in os.listdir(user_defined_audio_dir):
54
+ with codecs.open(
55
+ os.path.join(user_defined_audio_dir, f), encoding="utf-8"
56
+ ) as fd:
57
+ user_defined_audio_family = CustomAudioModelFamilyV1.parse_obj(
58
+ json.load(fd)
59
+ )
60
+ register_audio(user_defined_audio_family, persist=False)
61
+
62
+ # register model description
63
+ for ud_audio in get_user_defined_audios():
64
+ AUDIO_MODEL_DESCRIPTIONS.update(generate_audio_description(ud_audio))
26
65
 
27
66
  del _model_spec_json