xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +25 -6
- xinference/client/oscar/actor_client.py +4 -3
- xinference/client/restful/restful_client.py +8 -2
- xinference/core/supervisor.py +16 -0
- xinference/model/embedding/core.py +1 -2
- xinference/model/llm/__init__.py +0 -6
- xinference/model/llm/ggml/llamacpp.py +2 -10
- xinference/model/llm/llm_family.json +244 -7
- xinference/model/llm/llm_family.py +15 -0
- xinference/model/llm/llm_family_modelscope.json +100 -0
- xinference/model/llm/pytorch/chatglm.py +2 -0
- xinference/model/llm/pytorch/core.py +22 -28
- xinference/model/llm/pytorch/internlm2.py +2 -0
- xinference/model/llm/pytorch/qwen_vl.py +2 -0
- xinference/model/llm/pytorch/yi_vl.py +4 -2
- xinference/model/llm/utils.py +42 -4
- xinference/model/llm/vllm/core.py +51 -6
- xinference/model/rerank/core.py +3 -0
- xinference/thirdparty/omnilmm/chat.py +1 -1
- xinference/types.py +15 -19
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
- xinference/model/llm/ggml/ctransformers.py +0 -281
- xinference/model/llm/ggml/ctransformers_util.py +0 -161
- xinference/web/ui/build/static/js/main.98516614.js +0 -3
- xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
- /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2024-
|
|
11
|
+
"date": "2024-04-11T15:35:46+0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.10.
|
|
14
|
+
"full-revisionid": "e3a947ebddfc53b5e8ec723c1f632c2b895edef1",
|
|
15
|
+
"version": "0.10.1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
xinference/api/restful_api.py
CHANGED
|
@@ -1007,8 +1007,16 @@ class RESTfulAPI:
|
|
|
1007
1007
|
raise HTTPException(status_code=500, detail=str(e))
|
|
1008
1008
|
|
|
1009
1009
|
async def create_embedding(self, request: Request) -> Response:
|
|
1010
|
-
|
|
1010
|
+
payload = await request.json()
|
|
1011
|
+
body = CreateEmbeddingRequest.parse_obj(payload)
|
|
1011
1012
|
model_uid = body.model
|
|
1013
|
+
exclude = {
|
|
1014
|
+
"model",
|
|
1015
|
+
"input",
|
|
1016
|
+
"user",
|
|
1017
|
+
"encoding_format",
|
|
1018
|
+
}
|
|
1019
|
+
kwargs = {key: value for key, value in payload.items() if key not in exclude}
|
|
1012
1020
|
|
|
1013
1021
|
try:
|
|
1014
1022
|
model = await (await self._get_supervisor_ref()).get_model(model_uid)
|
|
@@ -1022,7 +1030,7 @@ class RESTfulAPI:
|
|
|
1022
1030
|
raise HTTPException(status_code=500, detail=str(e))
|
|
1023
1031
|
|
|
1024
1032
|
try:
|
|
1025
|
-
embedding = await model.create_embedding(body.input)
|
|
1033
|
+
embedding = await model.create_embedding(body.input, **kwargs)
|
|
1026
1034
|
return Response(embedding, media_type="application/json")
|
|
1027
1035
|
except RuntimeError as re:
|
|
1028
1036
|
logger.error(re, exc_info=True)
|
|
@@ -1035,8 +1043,15 @@ class RESTfulAPI:
|
|
|
1035
1043
|
raise HTTPException(status_code=500, detail=str(e))
|
|
1036
1044
|
|
|
1037
1045
|
async def rerank(self, request: Request) -> Response:
|
|
1038
|
-
|
|
1046
|
+
payload = await request.json()
|
|
1047
|
+
body = RerankRequest.parse_obj(payload)
|
|
1039
1048
|
model_uid = body.model
|
|
1049
|
+
kwargs = {
|
|
1050
|
+
key: value
|
|
1051
|
+
for key, value in payload.items()
|
|
1052
|
+
if key not in RerankRequest.__annotations__.keys()
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1040
1055
|
try:
|
|
1041
1056
|
model = await (await self._get_supervisor_ref()).get_model(model_uid)
|
|
1042
1057
|
except ValueError as ve:
|
|
@@ -1055,6 +1070,7 @@ class RESTfulAPI:
|
|
|
1055
1070
|
top_n=body.top_n,
|
|
1056
1071
|
max_chunks_per_doc=body.max_chunks_per_doc,
|
|
1057
1072
|
return_documents=body.return_documents,
|
|
1073
|
+
**kwargs,
|
|
1058
1074
|
)
|
|
1059
1075
|
return Response(scores, media_type="application/json")
|
|
1060
1076
|
except RuntimeError as re:
|
|
@@ -1345,9 +1361,12 @@ class RESTfulAPI:
|
|
|
1345
1361
|
detail=f"Only {function_call_models} support tool messages",
|
|
1346
1362
|
)
|
|
1347
1363
|
if body.tools and body.stream:
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1364
|
+
is_vllm = await model.is_vllm_backend()
|
|
1365
|
+
if not is_vllm or model_family not in ["qwen-chat", "qwen1.5-chat"]:
|
|
1366
|
+
raise HTTPException(
|
|
1367
|
+
status_code=400,
|
|
1368
|
+
detail="Streaming support for tool calls is available only when using vLLM backend and Qwen models.",
|
|
1369
|
+
)
|
|
1351
1370
|
|
|
1352
1371
|
if body.stream:
|
|
1353
1372
|
|
|
@@ -111,7 +111,7 @@ class ClientIteratorWrapper(AsyncIterator):
|
|
|
111
111
|
|
|
112
112
|
|
|
113
113
|
class EmbeddingModelHandle(ModelHandle):
|
|
114
|
-
def create_embedding(self, input: Union[str, List[str]]) -> bytes:
|
|
114
|
+
def create_embedding(self, input: Union[str, List[str]], **kwargs) -> bytes:
|
|
115
115
|
"""
|
|
116
116
|
Creates an embedding vector representing the input text.
|
|
117
117
|
|
|
@@ -128,7 +128,7 @@ class EmbeddingModelHandle(ModelHandle):
|
|
|
128
128
|
machine learning models and algorithms.
|
|
129
129
|
"""
|
|
130
130
|
|
|
131
|
-
coro = self._model_ref.create_embedding(input)
|
|
131
|
+
coro = self._model_ref.create_embedding(input, **kwargs)
|
|
132
132
|
return orjson.loads(self._isolation.call(coro))
|
|
133
133
|
|
|
134
134
|
|
|
@@ -140,6 +140,7 @@ class RerankModelHandle(ModelHandle):
|
|
|
140
140
|
top_n: Optional[int],
|
|
141
141
|
max_chunks_per_doc: Optional[int],
|
|
142
142
|
return_documents: Optional[bool],
|
|
143
|
+
**kwargs,
|
|
143
144
|
):
|
|
144
145
|
"""
|
|
145
146
|
Returns an ordered list of documents ordered by their relevance to the provided query.
|
|
@@ -163,7 +164,7 @@ class RerankModelHandle(ModelHandle):
|
|
|
163
164
|
|
|
164
165
|
"""
|
|
165
166
|
coro = self._model_ref.rerank(
|
|
166
|
-
documents, query, top_n, max_chunks_per_doc, return_documents
|
|
167
|
+
documents, query, top_n, max_chunks_per_doc, return_documents, **kwargs
|
|
167
168
|
)
|
|
168
169
|
results = orjson.loads(self._isolation.call(coro))
|
|
169
170
|
for r in results["results"]:
|
|
@@ -80,7 +80,7 @@ class RESTfulModelHandle:
|
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
|
|
83
|
-
def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
|
|
83
|
+
def create_embedding(self, input: Union[str, List[str]], **kwargs) -> "Embedding":
|
|
84
84
|
"""
|
|
85
85
|
Create an Embedding from user input via RESTful APIs.
|
|
86
86
|
|
|
@@ -102,7 +102,11 @@ class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
|
|
|
102
102
|
|
|
103
103
|
"""
|
|
104
104
|
url = f"{self._base_url}/v1/embeddings"
|
|
105
|
-
request_body = {
|
|
105
|
+
request_body = {
|
|
106
|
+
"model": self._model_uid,
|
|
107
|
+
"input": input,
|
|
108
|
+
}
|
|
109
|
+
request_body.update(kwargs)
|
|
106
110
|
response = requests.post(url, json=request_body, headers=self.auth_headers)
|
|
107
111
|
if response.status_code != 200:
|
|
108
112
|
raise RuntimeError(
|
|
@@ -121,6 +125,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
|
|
|
121
125
|
top_n: Optional[int] = None,
|
|
122
126
|
max_chunks_per_doc: Optional[int] = None,
|
|
123
127
|
return_documents: Optional[bool] = None,
|
|
128
|
+
**kwargs,
|
|
124
129
|
):
|
|
125
130
|
"""
|
|
126
131
|
Returns an ordered list of documents ordered by their relevance to the provided query.
|
|
@@ -156,6 +161,7 @@ class RESTfulRerankModelHandle(RESTfulModelHandle):
|
|
|
156
161
|
"max_chunks_per_doc": max_chunks_per_doc,
|
|
157
162
|
"return_documents": return_documents,
|
|
158
163
|
}
|
|
164
|
+
request_body.update(kwargs)
|
|
159
165
|
response = requests.post(url, json=request_body, headers=self.auth_headers)
|
|
160
166
|
if response.status_code != 200:
|
|
161
167
|
raise RuntimeError(
|
xinference/core/supervisor.py
CHANGED
|
@@ -870,6 +870,12 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
870
870
|
address,
|
|
871
871
|
dead_models,
|
|
872
872
|
)
|
|
873
|
+
for replica_model_uid in dead_models:
|
|
874
|
+
model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
|
|
875
|
+
self._model_uid_to_replica_info.pop(model_uid, None)
|
|
876
|
+
self._replica_model_uid_to_worker.pop(
|
|
877
|
+
replica_model_uid, None
|
|
878
|
+
)
|
|
873
879
|
dead_nodes.append(address)
|
|
874
880
|
elif (
|
|
875
881
|
status.failure_remaining_count
|
|
@@ -979,6 +985,16 @@ class SupervisorActor(xo.StatelessActor):
|
|
|
979
985
|
|
|
980
986
|
@log_async(logger=logger)
|
|
981
987
|
async def remove_worker(self, worker_address: str):
|
|
988
|
+
uids_to_remove = []
|
|
989
|
+
for model_uid in self._replica_model_uid_to_worker:
|
|
990
|
+
if self._replica_model_uid_to_worker[model_uid].address == worker_address:
|
|
991
|
+
uids_to_remove.append(model_uid)
|
|
992
|
+
|
|
993
|
+
for replica_model_uid in uids_to_remove:
|
|
994
|
+
model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
|
|
995
|
+
self._model_uid_to_replica_info.pop(model_uid, None)
|
|
996
|
+
self._replica_model_uid_to_worker.pop(replica_model_uid, None)
|
|
997
|
+
|
|
982
998
|
if worker_address in self._worker_address_to_worker:
|
|
983
999
|
del self._worker_address_to_worker[worker_address]
|
|
984
1000
|
logger.debug("Worker %s has been removed successfully", worker_address)
|
|
@@ -136,7 +136,7 @@ class EmbeddingModel:
|
|
|
136
136
|
def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
|
|
137
137
|
from sentence_transformers import SentenceTransformer
|
|
138
138
|
|
|
139
|
-
|
|
139
|
+
kwargs.setdefault("normalize_embeddings", True)
|
|
140
140
|
|
|
141
141
|
# copied from sentence-transformers, and modify it to return tokens num
|
|
142
142
|
@no_type_check
|
|
@@ -272,7 +272,6 @@ class EmbeddingModel:
|
|
|
272
272
|
self._model,
|
|
273
273
|
sentences,
|
|
274
274
|
convert_to_numpy=False,
|
|
275
|
-
normalize_embeddings=normalize_embeddings,
|
|
276
275
|
**kwargs,
|
|
277
276
|
)
|
|
278
277
|
if isinstance(sentences, str):
|
xinference/model/llm/__init__.py
CHANGED
|
@@ -49,7 +49,6 @@ from .llm_family import (
|
|
|
49
49
|
|
|
50
50
|
def _install():
|
|
51
51
|
from .ggml.chatglm import ChatglmCppChatModel
|
|
52
|
-
from .ggml.ctransformers import CtransformersModel
|
|
53
52
|
from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
|
|
54
53
|
from .pytorch.baichuan import BaichuanPytorchChatModel
|
|
55
54
|
from .pytorch.chatglm import ChatglmPytorchChatModel
|
|
@@ -77,11 +76,6 @@ def _install():
|
|
|
77
76
|
ChatglmCppChatModel,
|
|
78
77
|
]
|
|
79
78
|
)
|
|
80
|
-
LLM_CLASSES.extend(
|
|
81
|
-
[
|
|
82
|
-
CtransformersModel,
|
|
83
|
-
]
|
|
84
|
-
)
|
|
85
79
|
LLM_CLASSES.extend([SGLANGModel, SGLANGChatModel])
|
|
86
80
|
LLM_CLASSES.extend([VLLMModel, VLLMChatModel])
|
|
87
81
|
LLM_CLASSES.extend(
|
|
@@ -30,7 +30,6 @@ from ....types import (
|
|
|
30
30
|
from ..core import LLM
|
|
31
31
|
from ..llm_family import LLMFamilyV1, LLMSpecV1
|
|
32
32
|
from ..utils import ChatModelMixin
|
|
33
|
-
from .ctransformers import CTRANSFORMERS_SUPPORTED_MODEL
|
|
34
33
|
|
|
35
34
|
logger = logging.getLogger(__name__)
|
|
36
35
|
|
|
@@ -182,11 +181,7 @@ class LlamaCppModel(LLM):
|
|
|
182
181
|
) -> bool:
|
|
183
182
|
if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
|
|
184
183
|
return False
|
|
185
|
-
if
|
|
186
|
-
"chatglm" in llm_family.model_name
|
|
187
|
-
or "qwen" in llm_family.model_name
|
|
188
|
-
or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
|
|
189
|
-
):
|
|
184
|
+
if "chatglm" in llm_family.model_name or "qwen" in llm_family.model_name:
|
|
190
185
|
return False
|
|
191
186
|
if "generate" not in llm_family.model_ability:
|
|
192
187
|
return False
|
|
@@ -250,10 +245,7 @@ class LlamaCppChatModel(LlamaCppModel, ChatModelMixin):
|
|
|
250
245
|
) -> bool:
|
|
251
246
|
if llm_spec.model_format not in ["ggmlv3", "ggufv2"]:
|
|
252
247
|
return False
|
|
253
|
-
if
|
|
254
|
-
"chatglm" in llm_family.model_name
|
|
255
|
-
or llm_family.model_name in CTRANSFORMERS_SUPPORTED_MODEL
|
|
256
|
-
):
|
|
248
|
+
if "chatglm" in llm_family.model_name:
|
|
257
249
|
return False
|
|
258
250
|
if "chat" not in llm_family.model_ability:
|
|
259
251
|
return False
|
|
@@ -913,6 +913,38 @@
|
|
|
913
913
|
"model_id": "meta-llama/Llama-2-7b-chat-hf",
|
|
914
914
|
"model_revision": "08751db2aca9bf2f7f80d2e516117a53d7450235"
|
|
915
915
|
},
|
|
916
|
+
{
|
|
917
|
+
"model_format": "gptq",
|
|
918
|
+
"model_size_in_billions": 7,
|
|
919
|
+
"quantizations": [
|
|
920
|
+
"Int4"
|
|
921
|
+
],
|
|
922
|
+
"model_id": "TheBloke/Llama-2-7B-Chat-GPTQ"
|
|
923
|
+
},
|
|
924
|
+
{
|
|
925
|
+
"model_format": "gptq",
|
|
926
|
+
"model_size_in_billions": 70,
|
|
927
|
+
"quantizations": [
|
|
928
|
+
"Int4"
|
|
929
|
+
],
|
|
930
|
+
"model_id": "TheBloke/Llama-2-70B-Chat-GPTQ"
|
|
931
|
+
},
|
|
932
|
+
{
|
|
933
|
+
"model_format": "awq",
|
|
934
|
+
"model_size_in_billions": 70,
|
|
935
|
+
"quantizations": [
|
|
936
|
+
"Int4"
|
|
937
|
+
],
|
|
938
|
+
"model_id": "TheBloke/Llama-2-70B-Chat-AWQ"
|
|
939
|
+
},
|
|
940
|
+
{
|
|
941
|
+
"model_format": "awq",
|
|
942
|
+
"model_size_in_billions": 7,
|
|
943
|
+
"quantizations": [
|
|
944
|
+
"Int4"
|
|
945
|
+
],
|
|
946
|
+
"model_id": "TheBloke/Llama-2-7B-Chat-AWQ"
|
|
947
|
+
},
|
|
916
948
|
{
|
|
917
949
|
"model_format": "pytorch",
|
|
918
950
|
"model_size_in_billions": 13,
|
|
@@ -924,6 +956,22 @@
|
|
|
924
956
|
"model_id": "meta-llama/Llama-2-13b-chat-hf",
|
|
925
957
|
"model_revision": "0ba94ac9b9e1d5a0037780667e8b219adde1908c"
|
|
926
958
|
},
|
|
959
|
+
{
|
|
960
|
+
"model_format": "gptq",
|
|
961
|
+
"model_size_in_billions": 13,
|
|
962
|
+
"quantizations": [
|
|
963
|
+
"Int4"
|
|
964
|
+
],
|
|
965
|
+
"model_id": "TheBloke/Llama-2-13B-chat-GPTQ"
|
|
966
|
+
},
|
|
967
|
+
{
|
|
968
|
+
"model_format": "awq",
|
|
969
|
+
"model_size_in_billions": 13,
|
|
970
|
+
"quantizations": [
|
|
971
|
+
"Int4"
|
|
972
|
+
],
|
|
973
|
+
"model_id": "TheBloke/Llama-2-13B-chat-AWQ"
|
|
974
|
+
},
|
|
927
975
|
{
|
|
928
976
|
"model_format": "pytorch",
|
|
929
977
|
"model_size_in_billions": 70,
|
|
@@ -1045,6 +1093,22 @@
|
|
|
1045
1093
|
"model_id": "TheBloke/Llama-2-7B-GGML",
|
|
1046
1094
|
"model_file_name_template": "llama-2-7b.ggmlv3.{quantization}.bin"
|
|
1047
1095
|
},
|
|
1096
|
+
{
|
|
1097
|
+
"model_format": "gptq",
|
|
1098
|
+
"model_size_in_billions": 7,
|
|
1099
|
+
"quantizations": [
|
|
1100
|
+
"Int4"
|
|
1101
|
+
],
|
|
1102
|
+
"model_id": "TheBloke/Llama-2-7B-GPTQ"
|
|
1103
|
+
},
|
|
1104
|
+
{
|
|
1105
|
+
"model_format": "awq",
|
|
1106
|
+
"model_size_in_billions": 7,
|
|
1107
|
+
"quantizations": [
|
|
1108
|
+
"Int4"
|
|
1109
|
+
],
|
|
1110
|
+
"model_id": "TheBloke/Llama-2-7B-AWQ"
|
|
1111
|
+
},
|
|
1048
1112
|
{
|
|
1049
1113
|
"model_format": "ggmlv3",
|
|
1050
1114
|
"model_size_in_billions": 13,
|
|
@@ -1111,6 +1175,22 @@
|
|
|
1111
1175
|
"model_id": "meta-llama/Llama-2-13b-hf",
|
|
1112
1176
|
"model_revision": "db6b8eb1feabb38985fdf785a89895959e944936"
|
|
1113
1177
|
},
|
|
1178
|
+
{
|
|
1179
|
+
"model_format": "gptq",
|
|
1180
|
+
"model_size_in_billions": 13,
|
|
1181
|
+
"quantizations": [
|
|
1182
|
+
"Int4"
|
|
1183
|
+
],
|
|
1184
|
+
"model_id": "TheBloke/Llama-2-13B-GPTQ"
|
|
1185
|
+
},
|
|
1186
|
+
{
|
|
1187
|
+
"model_format": "awq",
|
|
1188
|
+
"model_size_in_billions": 13,
|
|
1189
|
+
"quantizations": [
|
|
1190
|
+
"Int4"
|
|
1191
|
+
],
|
|
1192
|
+
"model_id": "TheBloke/Llama-2-13B-AWQ"
|
|
1193
|
+
},
|
|
1114
1194
|
{
|
|
1115
1195
|
"model_format": "pytorch",
|
|
1116
1196
|
"model_size_in_billions": 70,
|
|
@@ -1121,6 +1201,22 @@
|
|
|
1121
1201
|
],
|
|
1122
1202
|
"model_id": "meta-llama/Llama-2-70b-hf",
|
|
1123
1203
|
"model_revision": "cc8aa03a000ff08b4d5c5b39673321a2a396c396"
|
|
1204
|
+
},
|
|
1205
|
+
{
|
|
1206
|
+
"model_format": "gptq",
|
|
1207
|
+
"model_size_in_billions": 70,
|
|
1208
|
+
"quantizations": [
|
|
1209
|
+
"Int4"
|
|
1210
|
+
],
|
|
1211
|
+
"model_id": "TheBloke/Llama-2-70B-GPTQ"
|
|
1212
|
+
},
|
|
1213
|
+
{
|
|
1214
|
+
"model_format": "awq",
|
|
1215
|
+
"model_size_in_billions": 70,
|
|
1216
|
+
"quantizations": [
|
|
1217
|
+
"Int4"
|
|
1218
|
+
],
|
|
1219
|
+
"model_id": "TheBloke/Llama-2-70B-AWQ"
|
|
1124
1220
|
}
|
|
1125
1221
|
]
|
|
1126
1222
|
},
|
|
@@ -1509,6 +1605,16 @@
|
|
|
1509
1605
|
],
|
|
1510
1606
|
"model_id": "Qwen/Qwen1.5-14B-Chat"
|
|
1511
1607
|
},
|
|
1608
|
+
{
|
|
1609
|
+
"model_format": "pytorch",
|
|
1610
|
+
"model_size_in_billions": 32,
|
|
1611
|
+
"quantizations": [
|
|
1612
|
+
"4-bit",
|
|
1613
|
+
"8-bit",
|
|
1614
|
+
"none"
|
|
1615
|
+
],
|
|
1616
|
+
"model_id": "Qwen/Qwen1.5-32B-Chat"
|
|
1617
|
+
},
|
|
1512
1618
|
{
|
|
1513
1619
|
"model_format": "pytorch",
|
|
1514
1620
|
"model_size_in_billions": 72,
|
|
@@ -1564,6 +1670,14 @@
|
|
|
1564
1670
|
],
|
|
1565
1671
|
"model_id": "Qwen/Qwen1.5-14B-Chat-GPTQ-{quantization}"
|
|
1566
1672
|
},
|
|
1673
|
+
{
|
|
1674
|
+
"model_format": "gptq",
|
|
1675
|
+
"model_size_in_billions": 32,
|
|
1676
|
+
"quantizations": [
|
|
1677
|
+
"Int4"
|
|
1678
|
+
],
|
|
1679
|
+
"model_id": "Qwen/Qwen1.5-32B-Chat-GPTQ-{quantization}"
|
|
1680
|
+
},
|
|
1567
1681
|
{
|
|
1568
1682
|
"model_format": "gptq",
|
|
1569
1683
|
"model_size_in_billions": 72,
|
|
@@ -1613,6 +1727,14 @@
|
|
|
1613
1727
|
],
|
|
1614
1728
|
"model_id": "Qwen/Qwen1.5-14B-Chat-AWQ"
|
|
1615
1729
|
},
|
|
1730
|
+
{
|
|
1731
|
+
"model_format": "awq",
|
|
1732
|
+
"model_size_in_billions": 32,
|
|
1733
|
+
"quantizations": [
|
|
1734
|
+
"Int4"
|
|
1735
|
+
],
|
|
1736
|
+
"model_id": "Qwen/Qwen1.5-32B-Chat-AWQ"
|
|
1737
|
+
},
|
|
1616
1738
|
{
|
|
1617
1739
|
"model_format": "awq",
|
|
1618
1740
|
"model_size_in_billions": 72,
|
|
@@ -1701,6 +1823,22 @@
|
|
|
1701
1823
|
"model_id": "Qwen/Qwen1.5-14B-Chat-GGUF",
|
|
1702
1824
|
"model_file_name_template": "qwen1_5-14b-chat-{quantization}.gguf"
|
|
1703
1825
|
},
|
|
1826
|
+
{
|
|
1827
|
+
"model_format": "ggufv2",
|
|
1828
|
+
"model_size_in_billions": 32,
|
|
1829
|
+
"quantizations": [
|
|
1830
|
+
"q2_k",
|
|
1831
|
+
"q3_k_m",
|
|
1832
|
+
"q4_0",
|
|
1833
|
+
"q4_k_m",
|
|
1834
|
+
"q5_0",
|
|
1835
|
+
"q5_k_m",
|
|
1836
|
+
"q6_k",
|
|
1837
|
+
"q8_0"
|
|
1838
|
+
],
|
|
1839
|
+
"model_id": "Qwen/Qwen1.5-32B-Chat-GGUF",
|
|
1840
|
+
"model_file_name_template": "qwen1_5-32b-chat-{quantization}.gguf"
|
|
1841
|
+
},
|
|
1704
1842
|
{
|
|
1705
1843
|
"model_format": "ggufv2",
|
|
1706
1844
|
"model_size_in_billions": 72,
|
|
@@ -1740,6 +1878,58 @@
|
|
|
1740
1878
|
]
|
|
1741
1879
|
}
|
|
1742
1880
|
},
|
|
1881
|
+
{
|
|
1882
|
+
"version": 1,
|
|
1883
|
+
"context_length": 32768,
|
|
1884
|
+
"model_name": "qwen1.5-moe-chat",
|
|
1885
|
+
"model_lang": [
|
|
1886
|
+
"en",
|
|
1887
|
+
"zh"
|
|
1888
|
+
],
|
|
1889
|
+
"model_ability": [
|
|
1890
|
+
"chat"
|
|
1891
|
+
],
|
|
1892
|
+
"model_description": "Qwen1.5-MoE is a transformer-based MoE decoder-only language model pretrained on a large amount of data.",
|
|
1893
|
+
"model_specs": [
|
|
1894
|
+
{
|
|
1895
|
+
"model_format": "pytorch",
|
|
1896
|
+
"model_size_in_billions": "2_7",
|
|
1897
|
+
"quantizations": [
|
|
1898
|
+
"4-bit",
|
|
1899
|
+
"8-bit",
|
|
1900
|
+
"none"
|
|
1901
|
+
],
|
|
1902
|
+
"model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
|
1903
|
+
},
|
|
1904
|
+
{
|
|
1905
|
+
"model_format": "gptq",
|
|
1906
|
+
"model_size_in_billions": "2_7",
|
|
1907
|
+
"quantizations": [
|
|
1908
|
+
"Int4"
|
|
1909
|
+
],
|
|
1910
|
+
"model_id": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4"
|
|
1911
|
+
}
|
|
1912
|
+
],
|
|
1913
|
+
"prompt_style": {
|
|
1914
|
+
"style_name": "QWEN",
|
|
1915
|
+
"system_prompt": "You are a helpful assistant.",
|
|
1916
|
+
"roles": [
|
|
1917
|
+
"user",
|
|
1918
|
+
"assistant"
|
|
1919
|
+
],
|
|
1920
|
+
"intra_message_sep": "\n",
|
|
1921
|
+
"stop_token_ids": [
|
|
1922
|
+
151643,
|
|
1923
|
+
151644,
|
|
1924
|
+
151645
|
|
1925
|
+
],
|
|
1926
|
+
"stop": [
|
|
1927
|
+
"<|endoftext|>",
|
|
1928
|
+
"<|im_start|>",
|
|
1929
|
+
"<|im_end|>"
|
|
1930
|
+
]
|
|
1931
|
+
}
|
|
1932
|
+
},
|
|
1743
1933
|
{
|
|
1744
1934
|
"version": 1,
|
|
1745
1935
|
"context_length": 8192,
|
|
@@ -1780,13 +1970,13 @@
|
|
|
1780
1970
|
"model_description": "GPT-2 is a Transformer-based LLM that is trained on WebTest, a 40 GB dataset of Reddit posts with 3+ upvotes.",
|
|
1781
1971
|
"model_specs": [
|
|
1782
1972
|
{
|
|
1783
|
-
"model_format": "
|
|
1784
|
-
"model_size_in_billions":
|
|
1973
|
+
"model_format": "pytorch",
|
|
1974
|
+
"model_size_in_billions": "1_5",
|
|
1785
1975
|
"quantizations": [
|
|
1786
1976
|
"none"
|
|
1787
1977
|
],
|
|
1788
|
-
"model_id": "
|
|
1789
|
-
"
|
|
1978
|
+
"model_id": "openai-community/gpt2",
|
|
1979
|
+
"model_revision": "607a30d783dfa663caf39e06633721c8d4cfcd7e"
|
|
1790
1980
|
}
|
|
1791
1981
|
]
|
|
1792
1982
|
},
|
|
@@ -2569,6 +2759,22 @@
|
|
|
2569
2759
|
"model_id": "mistralai/Mistral-7B-Instruct-v0.1",
|
|
2570
2760
|
"model_revision": "54766df6d50e4d3d7ccd66758e5341ba105a6d36"
|
|
2571
2761
|
},
|
|
2762
|
+
{
|
|
2763
|
+
"model_format": "awq",
|
|
2764
|
+
"model_size_in_billions": 7,
|
|
2765
|
+
"quantizations": [
|
|
2766
|
+
"Int4"
|
|
2767
|
+
],
|
|
2768
|
+
"model_id": "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"
|
|
2769
|
+
},
|
|
2770
|
+
{
|
|
2771
|
+
"model_format": "gptq",
|
|
2772
|
+
"model_size_in_billions": 7,
|
|
2773
|
+
"quantizations": [
|
|
2774
|
+
"Int4"
|
|
2775
|
+
],
|
|
2776
|
+
"model_id": "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
|
|
2777
|
+
},
|
|
2572
2778
|
{
|
|
2573
2779
|
"model_format": "ggufv2",
|
|
2574
2780
|
"model_size_in_billions": 7,
|
|
@@ -2630,6 +2836,22 @@
|
|
|
2630
2836
|
"model_id": "mistralai/Mistral-7B-Instruct-v0.2",
|
|
2631
2837
|
"model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
|
|
2632
2838
|
},
|
|
2839
|
+
{
|
|
2840
|
+
"model_format": "gptq",
|
|
2841
|
+
"model_size_in_billions": 7,
|
|
2842
|
+
"quantizations": [
|
|
2843
|
+
"Int4"
|
|
2844
|
+
],
|
|
2845
|
+
"model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
|
|
2846
|
+
},
|
|
2847
|
+
{
|
|
2848
|
+
"model_format": "awq",
|
|
2849
|
+
"model_size_in_billions": 7,
|
|
2850
|
+
"quantizations": [
|
|
2851
|
+
"Int4"
|
|
2852
|
+
],
|
|
2853
|
+
"model_id": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
|
|
2854
|
+
},
|
|
2633
2855
|
{
|
|
2634
2856
|
"model_format": "ggufv2",
|
|
2635
2857
|
"model_size_in_billions": 7,
|
|
@@ -2790,6 +3012,14 @@
|
|
|
2790
3012
|
"model_id": "mistralai/Mixtral-8x7B-v0.1",
|
|
2791
3013
|
"model_revision": "58301445dc1378584211722b7ebf8743ec4e192b"
|
|
2792
3014
|
},
|
|
3015
|
+
{
|
|
3016
|
+
"model_format": "gptq",
|
|
3017
|
+
"model_size_in_billions": "46_7",
|
|
3018
|
+
"quantizations": [
|
|
3019
|
+
"Int4"
|
|
3020
|
+
],
|
|
3021
|
+
"model_id": "TheBloke/Mixtral-8x7B-v0.1-GPTQ"
|
|
3022
|
+
},
|
|
2793
3023
|
{
|
|
2794
3024
|
"model_format": "ggufv2",
|
|
2795
3025
|
"model_size_in_billions": "46_7",
|
|
@@ -2839,10 +3069,17 @@
|
|
|
2839
3069
|
"model_format": "awq",
|
|
2840
3070
|
"model_size_in_billions": "46_7",
|
|
2841
3071
|
"quantizations": [
|
|
2842
|
-
"
|
|
3072
|
+
"Int4"
|
|
3073
|
+
],
|
|
3074
|
+
"model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ"
|
|
3075
|
+
},
|
|
3076
|
+
{
|
|
3077
|
+
"model_format": "gptq",
|
|
3078
|
+
"model_size_in_billions": "46_7",
|
|
3079
|
+
"quantizations": [
|
|
3080
|
+
"Int4"
|
|
2843
3081
|
],
|
|
2844
|
-
"model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-
|
|
2845
|
-
"model_revision": "9afb6f0a7d7fe9ecebdda1baa4ff4e13e73e97d7"
|
|
3082
|
+
"model_id": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
|
|
2846
3083
|
},
|
|
2847
3084
|
{
|
|
2848
3085
|
"model_format": "ggufv2",
|
|
@@ -199,6 +199,21 @@ class CustomLLMFamilyV1(LLMFamilyV1):
|
|
|
199
199
|
)
|
|
200
200
|
llm_spec.prompt_style = BUILTIN_LLM_PROMPT_STYLE[prompt_style_name]
|
|
201
201
|
|
|
202
|
+
# check model ability, registering LLM only provides generate and chat
|
|
203
|
+
# but for vision models, we add back the abilities so that
|
|
204
|
+
# gradio chat interface can be generated properly
|
|
205
|
+
if (
|
|
206
|
+
llm_spec.model_family != "other"
|
|
207
|
+
and llm_spec.model_family
|
|
208
|
+
in {
|
|
209
|
+
family.model_name
|
|
210
|
+
for family in BUILTIN_LLM_FAMILIES
|
|
211
|
+
if "vision" in family.model_ability
|
|
212
|
+
}
|
|
213
|
+
and "vision" not in llm_spec.model_ability
|
|
214
|
+
):
|
|
215
|
+
llm_spec.model_ability.append("vision")
|
|
216
|
+
|
|
202
217
|
return llm_spec
|
|
203
218
|
|
|
204
219
|
|