xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +25 -6
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +8 -2
  5. xinference/core/supervisor.py +16 -0
  6. xinference/model/embedding/core.py +1 -2
  7. xinference/model/llm/__init__.py +0 -6
  8. xinference/model/llm/ggml/llamacpp.py +2 -10
  9. xinference/model/llm/llm_family.json +244 -7
  10. xinference/model/llm/llm_family.py +15 -0
  11. xinference/model/llm/llm_family_modelscope.json +100 -0
  12. xinference/model/llm/pytorch/chatglm.py +2 -0
  13. xinference/model/llm/pytorch/core.py +22 -28
  14. xinference/model/llm/pytorch/internlm2.py +2 -0
  15. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  16. xinference/model/llm/pytorch/yi_vl.py +4 -2
  17. xinference/model/llm/utils.py +42 -4
  18. xinference/model/llm/vllm/core.py +51 -6
  19. xinference/model/rerank/core.py +3 -0
  20. xinference/thirdparty/omnilmm/chat.py +1 -1
  21. xinference/types.py +15 -19
  22. xinference/web/ui/build/asset-manifest.json +3 -3
  23. xinference/web/ui/build/index.html +1 -1
  24. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  25. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  27. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  44. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
  45. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
  46. xinference/model/llm/ggml/ctransformers.py +0 -281
  47. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  48. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  49. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  52. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  73. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  74. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  75. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  76. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2024-03-29T12:46:14+0800",
11
+ "date": "2024-04-11T15:35:46+0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "2857ec497afbd2a6895d3658384ff3b4022b2840",
15
- "version": "0.10.0"
14
+ "full-revisionid": "e3a947ebddfc53b5e8ec723c1f632c2b895edef1",
15
+ "version": "0.10.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -1007,8 +1007,16 @@ class RESTfulAPI:
1007
1007
  raise HTTPException(status_code=500, detail=str(e))
1008
1008
 
1009
1009
  async def create_embedding(self, request: Request) -> Response:
1010
- body = CreateEmbeddingRequest.parse_obj(await request.json())
1010
+ payload = await request.json()
1011
+ body = CreateEmbeddingRequest.parse_obj(payload)
1011
1012
  model_uid = body.model
1013
+ exclude = {
1014
+ "model",
1015
+ "input",
1016
+ "user",
1017
+ "encoding_format",
1018
+ }
1019
+ kwargs = {key: value for key, value in payload.items() if key not in exclude}
1012
1020
 
1013
1021
  try:
1014
1022
  model = await (await self._get_supervisor_ref()).get_model(model_uid)
@@ -1022,7 +1030,7 @@ class RESTfulAPI:
1022
1030
  raise HTTPException(status_code=500, detail=str(e))
1023
1031
 
1024
1032
  try:
1025
- embedding = await model.create_embedding(body.input)
1033
+ embedding = await model.create_embedding(body.input, **kwargs)
1026
1034
  return Response(embedding, media_type="application/json")
1027
1035
  except RuntimeError as re:
1028
1036
  logger.error(re, exc_info=True)
@@ -1035,8 +1043,15 @@ class RESTfulAPI:
1035
1043
  raise HTTPException(status_code=500, detail=str(e))
1036
1044
 
1037
1045
  async def rerank(self, request: Request) -> Response:
1038
- body = RerankRequest.parse_obj(await request.json())
1046
+ payload = await request.json()
1047
+ body = RerankRequest.parse_obj(payload)
1039
1048
  model_uid = body.model
1049
+ kwargs = {
1050
+ key: value
1051
+ for key, value in payload.items()
1052
+ if key not in RerankRequest.__annotations__.keys()
1053
+ }
1054
+
1040
1055
  try:
1041
1056
  model = await (await self._get_supervisor_ref()).get_model(model_uid)
1042
1057
  except ValueError as ve:
@@ -1055,6 +1070,7 @@ class RESTfulAPI:
1055
1070
  top_n=body.top_n,
1056
1071
  max_chunks_per_doc=body.max_chunks_per_doc,
1057
1072
  return_documents=body.return_documents,
1073
+ **kwargs,
1058
1074
  )
1059
1075
  return Response(scores, media_type="application/json")
1060
1076
  except RuntimeError as re:
@@ -1345,9 +1361,12 @@ class RESTfulAPI:
1345
1361
  detail=f"Only {function_call_models} support tool messages",
1346
1362
  )
1347
1363
  if body.tools and body.stream:
1348
- raise HTTPException(
1349
- status_code=400, detail="Tool calls does not support stream"
1350
- )
1364
+ is_vllm = await model.is_vllm_backend()
1365
+ if not is_vllm or model_family not in ["qwen-chat", "qwen1.5-chat"]:
1366
+ raise HTTPException(
1367
+ status_code=400,
1368
+ detail="Streaming support for tool calls is available only when using vLLM backend and Qwen models.",
1369
+ )
1351
1370
 
1352
1371
  if body.stream:
1353
1372
 
@@ -111,7 +111,7 @@ class ClientIteratorWrapper(AsyncIterator):
111
111
 
112
112
 
113
113
  class EmbeddingModelHandle(ModelHandle):
114
- def create_embedding(self, input: Union[str, List[str]]) -> bytes:
114
+ def create_embedding(self, input: Union[str, List[str]], **kwargs) -> bytes:
115
115
  """
116
116
  Creates an embedding vector representing the input text.
117
117
 
@@ -128,7 +128,7 @@ class EmbeddingModelHandle(ModelHandle):
128
128
  machine learning models and algorithms.
129
129
  """
130
130
 
131
- coro = self._model_ref.create_embedding(input)
131
+ coro = self._model_ref.create_embedding(input, **kwargs)
132
132
  return orjson.loads(self._isolation.call(coro))
133
133
 
134
134
 
@@ -140,6 +140,7 @@ class RerankModelHandle(ModelHandle):
140
140
  top_n: Optional[int],
141
141
  max_chunks_per_doc: Optional[int],
142
142
  return_documents: Optional[bool],
143
+ **kwargs,
143
144
  ):
144
145
  """
145
146
  Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -163,7 +164,7 @@ class RerankModelHandle(ModelHandle):
163
164
 
164
165
  """
165
166
  coro = self._model_ref.rerank(
166
- documents, query, top_n, max_chunks_per_doc, return_documents
167
+ documents, query, top_n, max_chunks_per_doc, return_documents, **kwargs
167
168
  )
168
169
  results = orjson.loads(self._isolation.call(coro))
169
170
  for r in results["results"]:
@@ -80,7 +80,7 @@ class RESTfulModelHandle:
80
80
 
81
81
 
82
82
  class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
83
- def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
83
+ def create_embedding(self, input: Union[str, List[str]], **kwargs) -> "Embedding":
84
84
  """
85
85
  Create an Embedding from user input via RESTful APIs.
86
86
 
@@ -102,7 +102,11 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
102
102
 
103
103
  """
104
104
  url = f"{self._base_url}/v1/embeddings"
105
- request_body = {"model": self._model_uid, "input": input}
105
+ request_body = {
106
+ "model": self._model_uid,
107
+ "input": input,
108
+ }
109
+ request_body.update(kwargs)
106
110
  response = requests.post(url, json=request_body, headers=self.auth_headers)
107
111
  if response.status_code != 200:
108
112
  raise RuntimeError(
@@ -121,6 +125,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
121
125
  top_n: Optional[int] = None,
122
126
  max_chunks_per_doc: Optional[int] = None,
123
127
  return_documents: Optional[bool] = None,
128
+ **kwargs,
124
129
  ):
125
130
  """
126
131
  Returns an ordered list of documents ordered by their relevance to the provided query.
@@ -156,6 +161,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
156
161
  "max_chunks_per_doc": max_chunks_per_doc,
157
162
  "return_documents": return_documents,
158
163
  }
164
+ request_body.update(kwargs)
159
165
  response = requests.post(url, json=request_body, headers=self.auth_headers)
160
166
  if response.status_code != 200:
161
167
  raise RuntimeError(
@@ -870,6 +870,12 @@ class SupervisorActor(xo.StatelessActor):
870
870
  address,
871
871
  dead_models,
872
872
  )
873
+ for replica_model_uid in dead_models:
874
+ model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
875
+ self._model_uid_to_replica_info.pop(model_uid, None)
876
+ self._replica_model_uid_to_worker.pop(
877
+ replica_model_uid, None
878
+ )
873
879
  dead_nodes.append(address)
874
880
  elif (
875
881
  status.failure_remaining_count
@@ -979,6 +985,16 @@ class SupervisorActor(xo.StatelessActor):
979
985
 
980
986
  @log_async(logger=logger)
981
987
  async def remove_worker(self, worker_address: str):
988
+ uids_to_remove = []
989
+ for model_uid in self._replica_model_uid_to_worker:
990
+ if self._replica_model_uid_to_worker[model_uid].address == worker_address:
991
+ uids_to_remove.append(model_uid)
992
+
993
+ for replica_model_uid in uids_to_remove:
994
+ model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
995
+ self._model_uid_to_replica_info.pop(model_uid, None)
996
+ self._replica_model_uid_to_worker.pop(replica_model_uid, None)
997
+
982
998
  if worker_address in self._worker_address_to_worker:
983
999
  del self._worker_address_to_worker[worker_address]
984
1000
  logger.debug("Worker %s has been removed successfully", worker_address)
@@ -136,7 +136,7 @@ class EmbeddingModel:
136
136
  def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
137
137
  from sentence_transformers import SentenceTransformer
138
138
 
139
- normalize_embeddings = kwargs.pop("normalize_embeddings", True)
139
+ kwargs.setdefault("normalize_embeddings", True)
140
140
 
141
141
  # copied from sentence-transformers, and modify it to return tokens num
142
142
  @no_type_check
@@ -272,7 +272,6 @@ class EmbeddingModel:
272
272
  self._model,
273
273
  sentences,
274
274
  convert_to_numpy=False,
275
- normalize_embeddings=normalize_embeddings,
276
275
  **kwargs,
277
276
  )
278
277
  if isinstance(sentences, str):
@@ -49,7 +49,6 @@ from .llm_family import (
49
49
 
50
50
  def _install():
51
51
  from .ggml.chatglm import ChatglmCppChatModel
52
- from .ggml.ctransformers import CtransformersModel
53
52
  from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
54
53
  from .pytorch.baichuan import BaichuanPytorchChatModel
55
54
  from .pytorch.chatglm import ChatglmPytorchChatModel
@@ -77,11 +76,6 @@ def _install():
77
76
  ChatglmCppChatModel,
78
77
  ]
79
78
  )
80
- LLM_CLASSES.extend(
81
- [
82
- CtransformersModel,
83
- ]
84
- )
85
79
  LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
86
80
  LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
87
81
  LLM_CLASSES.extend(
@@ -30,7 +30,6 @@ from ....types import (
30
30
  from ..core import LLM
31
31
  from ..llm_family import LLMFamilyV1, LLMSpecV1
32
32
  from ..utils import ChatModelMixin
33
- from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
34
33
 
35
34
  logger = logging.getLogger(__name__)
36
35
 
@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
182
181
  ) -> bool:
183
182
  if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
184
183
  return False
185
- if (
186
- "chatglm" in llm_family.model_name
187
- or "qwen" in llm_family.model_name
188
- or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
189
- ):
184
+ if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
190
185
  return False
191
186
  if "generate" not in llm_family.model_ability:
192
187
  return False
@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
250
245
  ) -> bool:
251
246
  if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
252
247
  return False
253
- if (
254
- "chatglm" in llm_family.model_name
255
- or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
256
- ):
248
+ if "chatglm" in llm_family.model_name:
257
249
  return False
258
250
  if "chat" not in llm_family.model_ability:
259
251
  return False
@@ -913,6 +913,38 @@
913
913
  "model_id": "meta-llama/Llama-2-7b-chat-hf",
914
914
  "model_revision": "08751db2aca9bf2f7f80d2e516117a53d7450235"
915
915
  },
916
+ {
917
+ "model_format": "gptq",
918
+ "model_size_in_billions": 7,
919
+ "quantizations": [
920
+ "Int4"
921
+ ],
922
+ "model_id": "TheBloke/Llama-2-7B-Chat-GPTQ"
923
+ },
924
+ {
925
+ "model_format": "gptq",
926
+ "model_size_in_billions": 70,
927
+ "quantizations": [
928
+ "Int4"
929
+ ],
930
+ "model_id": "TheBloke/Llama-2-70B-Chat-GPTQ"
931
+ },
932
+ {
933
+ "model_format": "awq",
934
+ "model_size_in_billions": 70,
935
+ "quantizations": [
936
+ "Int4"
937
+ ],
938
+ "model_id": "TheBloke/Llama-2-70B-Chat-AWQ"
939
+ },
940
+ {
941
+ "model_format": "awq",
942
+ "model_size_in_billions": 7,
943
+ "quantizations": [
944
+ "Int4"
945
+ ],
946
+ "model_id": "TheBloke/Llama-2-7B-Chat-AWQ"
947
+ },
916
948
  {
917
949
  "model_format": "pytorch",
918
950
  "model_size_in_billions": 13,
@@ -924,6 +956,22 @@
924
956
  "model_id": "meta-llama/Llama-2-13b-chat-hf",
925
957
  "model_revision": "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
926
958
  },
959
+ {
960
+ "model_format": "gptq",
961
+ "model_size_in_billions": 13,
962
+ "quantizations": [
963
+ "Int4"
964
+ ],
965
+ "model_id": "TheBloke/Llama-2-13B-chat-GPTQ"
966
+ },
967
+ {
968
+ "model_format": "awq",
969
+ "model_size_in_billions": 13,
970
+ "quantizations": [
971
+ "Int4"
972
+ ],
973
+ "model_id": "TheBloke/Llama-2-13B-chat-AWQ"
974
+ },
927
975
  {
928
976
  "model_format": "pytorch",
929
977
  "model_size_in_billions": 70,
@@ -1045,6 +1093,22 @@
1045
1093
  "model_id": "TheBloke/Llama-2-7B-GGML",
1046
1094
  "model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
1047
1095
  },
1096
+ {
1097
+ "model_format": "gptq",
1098
+ "model_size_in_billions": 7,
1099
+ "quantizations": [
1100
+ "Int4"
1101
+ ],
1102
+ "model_id": "TheBloke/Llama-2-7B-GPTQ"
1103
+ },
1104
+ {
1105
+ "model_format": "awq",
1106
+ "model_size_in_billions": 7,
1107
+ "quantizations": [
1108
+ "Int4"
1109
+ ],
1110
+ "model_id": "TheBloke/Llama-2-7B-AWQ"
1111
+ },
1048
1112
  {
1049
1113
  "model_format": "ggmlv3",
1050
1114
  "model_size_in_billions": 13,
@@ -1111,6 +1175,22 @@
1111
1175
  "model_id": "meta-llama/Llama-2-13b-hf",
1112
1176
  "model_revision": "db6b8eb1feabb38985fdf785a89895959e944936"
1113
1177
  },
1178
+ {
1179
+ "model_format": "gptq",
1180
+ "model_size_in_billions": 13,
1181
+ "quantizations": [
1182
+ "Int4"
1183
+ ],
1184
+ "model_id": "TheBloke/Llama-2-13B-GPTQ"
1185
+ },
1186
+ {
1187
+ "model_format": "awq",
1188
+ "model_size_in_billions": 13,
1189
+ "quantizations": [
1190
+ "Int4"
1191
+ ],
1192
+ "model_id": "TheBloke/Llama-2-13B-AWQ"
1193
+ },
1114
1194
  {
1115
1195
  "model_format": "pytorch",
1116
1196
  "model_size_in_billions": 70,
@@ -1121,6 +1201,22 @@
1121
1201
  ],
1122
1202
  "model_id": "meta-llama/Llama-2-70b-hf",
1123
1203
  "model_revision": "cc8aa03a000ff08b4d5c5b39673321a2a396c396"
1204
+ },
1205
+ {
1206
+ "model_format": "gptq",
1207
+ "model_size_in_billions": 70,
1208
+ "quantizations": [
1209
+ "Int4"
1210
+ ],
1211
+ "model_id": "TheBloke/Llama-2-70B-GPTQ"
1212
+ },
1213
+ {
1214
+ "model_format": "awq",
1215
+ "model_size_in_billions": 70,
1216
+ "quantizations": [
1217
+ "Int4"
1218
+ ],
1219
+ "model_id": "TheBloke/Llama-2-70B-AWQ"
1124
1220
  }
1125
1221
  ]
1126
1222
  },
@@ -1509,6 +1605,16 @@
1509
1605
  ],
1510
1606
  "model_id": "Qwen/Qwen1.5-14B-Chat"
1511
1607
  },
1608
+ {
1609
+ "model_format": "pytorch",
1610
+ "model_size_in_billions": 32,
1611
+ "quantizations": [
1612
+ "4-bit",
1613
+ "8-bit",
1614
+ "none"
1615
+ ],
1616
+ "model_id": "Qwen/Qwen1.5-32B-Chat"
1617
+ },
1512
1618
  {
1513
1619
  "model_format": "pytorch",
1514
1620
  "model_size_in_billions": 72,
@@ -1564,6 +1670,14 @@
1564
1670
  ],
1565
1671
  "model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}"
1566
1672
  },
1673
+ {
1674
+ "model_format": "gptq",
1675
+ "model_size_in_billions": 32,
1676
+ "quantizations": [
1677
+ "Int4"
1678
+ ],
1679
+ "model_id": "Qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}"
1680
+ },
1567
1681
  {
1568
1682
  "model_format": "gptq",
1569
1683
  "model_size_in_billions": 72,
@@ -1613,6 +1727,14 @@
1613
1727
  ],
1614
1728
  "model_id": "Qwen/Qwen1.5-14B-Chat-AWQ"
1615
1729
  },
1730
+ {
1731
+ "model_format": "awq",
1732
+ "model_size_in_billions": 32,
1733
+ "quantizations": [
1734
+ "Int4"
1735
+ ],
1736
+ "model_id": "Qwen/Qwen1.5-32B-Chat-AWQ"
1737
+ },
1616
1738
  {
1617
1739
  "model_format": "awq",
1618
1740
  "model_size_in_billions": 72,
@@ -1701,6 +1823,22 @@
1701
1823
  "model_id": "Qwen/Qwen1.5-14B-Chat-GGUF",
1702
1824
  "model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
1703
1825
  },
1826
+ {
1827
+ "model_format": "ggufv2",
1828
+ "model_size_in_billions": 32,
1829
+ "quantizations": [
1830
+ "q2_k",
1831
+ "q3_k_m",
1832
+ "q4_0",
1833
+ "q4_k_m",
1834
+ "q5_0",
1835
+ "q5_k_m",
1836
+ "q6_k",
1837
+ "q8_0"
1838
+ ],
1839
+ "model_id": "Qwen/Qwen1.5-32B-Chat-GGUF",
1840
+ "model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
1841
+ },
1704
1842
  {
1705
1843
  "model_format": "ggufv2",
1706
1844
  "model_size_in_billions": 72,
@@ -1740,6 +1878,58 @@
1740
1878
  ]
1741
1879
  }
1742
1880
  },
1881
+ {
1882
+ "version": 1,
1883
+ "context_length": 32768,
1884
+ "model_name": "qwen1.5-moe-chat",
1885
+ "model_lang": [
1886
+ "en",
1887
+ "zh"
1888
+ ],
1889
+ "model_ability": [
1890
+ "chat"
1891
+ ],
1892
+ "model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
1893
+ "model_specs": [
1894
+ {
1895
+ "model_format": "pytorch",
1896
+ "model_size_in_billions": "2_7",
1897
+ "quantizations": [
1898
+ "4-bit",
1899
+ "8-bit",
1900
+ "none"
1901
+ ],
1902
+ "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
1903
+ },
1904
+ {
1905
+ "model_format": "gptq",
1906
+ "model_size_in_billions": "2_7",
1907
+ "quantizations": [
1908
+ "Int4"
1909
+ ],
1910
+ "model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
1911
+ }
1912
+ ],
1913
+ "prompt_style": {
1914
+ "style_name": "QWEN",
1915
+ "system_prompt": "You are a helpful assistant.",
1916
+ "roles": [
1917
+ "user",
1918
+ "assistant"
1919
+ ],
1920
+ "intra_message_sep": "\n",
1921
+ "stop_token_ids": [
1922
+ 151643,
1923
+ 151644,
1924
+ 151645
1925
+ ],
1926
+ "stop": [
1927
+ "<|endoftext|>",
1928
+ "<|im_start|>",
1929
+ "<|im_end|>"
1930
+ ]
1931
+ }
1932
+ },
1743
1933
  {
1744
1934
  "version": 1,
1745
1935
  "context_length": 8192,
@@ -1780,13 +1970,13 @@
1780
1970
  "model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
1781
1971
  "model_specs": [
1782
1972
  {
1783
- "model_format": "ggmlv3",
1784
- "model_size_in_billions": 1,
1973
+ "model_format": "pytorch",
1974
+ "model_size_in_billions": "1_5",
1785
1975
  "quantizations": [
1786
1976
  "none"
1787
1977
  ],
1788
- "model_id": "marella/gpt-2-ggml",
1789
- "model_file_name_template": "ggml-model.bin"
1978
+ "model_id": "openai-community/gpt2",
1979
+ "model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
1790
1980
  }
1791
1981
  ]
1792
1982
  },
@@ -2569,6 +2759,22 @@
2569
2759
  "model_id": "mistralai/Mistral-7B-Instruct-v0.1",
2570
2760
  "model_revision": "54766df6d50e4d3d7ccd66758e5341ba105a6d36"
2571
2761
  },
2762
+ {
2763
+ "model_format": "awq",
2764
+ "model_size_in_billions": 7,
2765
+ "quantizations": [
2766
+ "Int4"
2767
+ ],
2768
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
2769
+ },
2770
+ {
2771
+ "model_format": "gptq",
2772
+ "model_size_in_billions": 7,
2773
+ "quantizations": [
2774
+ "Int4"
2775
+ ],
2776
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
2777
+ },
2572
2778
  {
2573
2779
  "model_format": "ggufv2",
2574
2780
  "model_size_in_billions": 7,
@@ -2630,6 +2836,22 @@
2630
2836
  "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
2631
2837
  "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
2632
2838
  },
2839
+ {
2840
+ "model_format": "gptq",
2841
+ "model_size_in_billions": 7,
2842
+ "quantizations": [
2843
+ "Int4"
2844
+ ],
2845
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
2846
+ },
2847
+ {
2848
+ "model_format": "awq",
2849
+ "model_size_in_billions": 7,
2850
+ "quantizations": [
2851
+ "Int4"
2852
+ ],
2853
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
2854
+ },
2633
2855
  {
2634
2856
  "model_format": "ggufv2",
2635
2857
  "model_size_in_billions": 7,
@@ -2790,6 +3012,14 @@
2790
3012
  "model_id": "mistralai/Mixtral-8x7B-v0.1",
2791
3013
  "model_revision": "58301445dc1378584211722b7ebf8743ec4e192b"
2792
3014
  },
3015
+ {
3016
+ "model_format": "gptq",
3017
+ "model_size_in_billions": "46_7",
3018
+ "quantizations": [
3019
+ "Int4"
3020
+ ],
3021
+ "model_id": "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
3022
+ },
2793
3023
  {
2794
3024
  "model_format": "ggufv2",
2795
3025
  "model_size_in_billions": "46_7",
@@ -2839,10 +3069,17 @@
2839
3069
  "model_format": "awq",
2840
3070
  "model_size_in_billions": "46_7",
2841
3071
  "quantizations": [
2842
- "4-bit"
3072
+ "Int4"
3073
+ ],
3074
+ "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ"
3075
+ },
3076
+ {
3077
+ "model_format": "gptq",
3078
+ "model_size_in_billions": "46_7",
3079
+ "quantizations": [
3080
+ "Int4"
2843
3081
  ],
2844
- "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
2845
- "model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
3082
+ "model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
2846
3083
  },
2847
3084
  {
2848
3085
  "model_format": "ggufv2",
@@ -199,6 +199,21 @@ class CustomLLMFamilyV1(LLMFamilyV1):
199
199
  )
200
200
  llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]
201
201
 
202
+ # check model ability, registering LLM only provides generate and chat
203
+ # but for vision models, we add back the abilities so that
204
+ # gradio chat interface can be generated properly
205
+ if (
206
+ llm_spec.model_family != "other"
207
+ and llm_spec.model_family
208
+ in {
209
+ family.model_name
210
+ for family in BUILTIN_LLM_FAMILIES
211
+ if "vision" in family.model_ability
212
+ }
213
+ and "vision" not in llm_spec.model_ability
214
+ ):
215
+ llm_spec.model_ability.append("vision")
216
+
202
217
  return llm_spec
203
218
 
204
219