xinference 0.10.1__py3-none-any.whl → 0.10.2.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +9 -9
- xinference/client/restful/restful_client.py +29 -16
- xinference/core/supervisor.py +32 -9
- xinference/core/worker.py +13 -8
- xinference/deploy/cmdline.py +22 -9
- xinference/model/audio/__init__.py +40 -1
- xinference/model/audio/core.py +25 -45
- xinference/model/audio/custom.py +148 -0
- xinference/model/core.py +6 -9
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/image/core.py +12 -4
- xinference/model/image/stable_diffusion/core.py +8 -7
- xinference/model/llm/core.py +9 -14
- xinference/model/llm/llm_family.json +263 -0
- xinference/model/llm/llm_family.py +26 -4
- xinference/model/llm/llm_family_modelscope.json +160 -0
- xinference/model/llm/pytorch/baichuan.py +4 -3
- xinference/model/llm/pytorch/chatglm.py +3 -2
- xinference/model/llm/pytorch/core.py +15 -13
- xinference/model/llm/pytorch/falcon.py +6 -5
- xinference/model/llm/pytorch/internlm2.py +3 -2
- xinference/model/llm/pytorch/llama_2.py +6 -5
- xinference/model/llm/pytorch/vicuna.py +4 -3
- xinference/model/llm/vllm/core.py +3 -0
- xinference/model/rerank/core.py +23 -12
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +25 -1
- xinference/model/utils.py +12 -1
- xinference/types.py +55 -0
- xinference/utils.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/METADATA +4 -1
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/RECORD +49 -46
- xinference/web/ui/build/static/js/main.76ef2b17.js +0 -3
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +0 -1
- /xinference/web/ui/build/static/js/{main.76ef2b17.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/LICENSE +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/WHEEL +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.post1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Copyright 2022-2023 XProbe Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
from threading import Lock
|
|
18
|
+
from typing import Any, List, Optional
|
|
19
|
+
|
|
20
|
+
from ..._compat import (
|
|
21
|
+
ROOT_KEY,
|
|
22
|
+
ErrorWrapper,
|
|
23
|
+
Protocol,
|
|
24
|
+
StrBytes,
|
|
25
|
+
ValidationError,
|
|
26
|
+
load_str_bytes,
|
|
27
|
+
)
|
|
28
|
+
from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
|
|
29
|
+
from .core import AudioModelFamilyV1
|
|
30
|
+
|
|
31
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Lock acquired by the functions in this module whenever they read or
# modify the UD_AUDIOS registry.
UD_AUDIO_LOCK = Lock()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CustomAudioModelFamilyV1(AudioModelFamilyV1):
    """Audio model family spec used for user-registered (custom) models.

    Declares ``model_id`` and ``model_revision`` as optional strings and adds
    an optional ``model_uri`` field.
    NOTE(review): the ``type: ignore`` markers suggest the base class declares
    these two fields with a stricter type -- confirm against
    ``AudioModelFamilyV1``.
    """

    model_id: Optional[str]  # type: ignore
    model_revision: Optional[str]  # type: ignore
    model_uri: Optional[str]

    @classmethod
    def parse_raw(
        cls: Any,
        b: StrBytes,
        *,
        content_type: Optional[str] = None,
        encoding: str = "utf8",
        proto: Protocol = None,
        allow_pickle: bool = False,
    ) -> AudioModelFamilyV1:
        """Parse a serialized spec and require that ``model_family`` is set.

        Modeled on ``BaseModel.parse_raw``: the raw payload is decoded with
        ``load_str_bytes`` and the resulting object is handed to
        ``cls.parse_obj``.

        Raises:
            ValidationError: if decoding the payload raises ``ValueError``,
                ``TypeError`` or ``UnicodeDecodeError``.
            ValueError: if the parsed spec has ``model_family`` set to None.
        """
        # Gather the decoder options up front; none of these lookups can
        # raise an exception type handled by the ``except`` clause below.
        decode_options = dict(
            proto=proto,
            content_type=content_type,
            encoding=encoding,
            allow_pickle=allow_pickle,
            json_loads=cls.__config__.json_loads,
        )
        try:
            payload = load_str_bytes(b, **decode_options)
        except (ValueError, TypeError, UnicodeDecodeError) as exc:
            raise ValidationError([ErrorWrapper(exc, loc=ROOT_KEY)], cls)

        spec: AudioModelFamilyV1 = cls.parse_obj(payload)

        # A custom audio model must state which family it belongs to.
        if spec.model_family is None:
            raise ValueError(
                f"You must specify `model_family` when registering custom Audio models."
            )
        assert isinstance(spec.model_family, str)
        return spec
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# In-memory registry of user-defined audio model specs; the functions in this
# module access it while holding UD_AUDIO_LOCK.
UD_AUDIOS: List[CustomAudioModelFamilyV1] = []
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_user_defined_audios() -> List[CustomAudioModelFamilyV1]:
    """Return a shallow copy of the registered user-defined audio specs.

    The copy is taken while holding ``UD_AUDIO_LOCK``, so callers receive a
    list they can iterate without being affected by later registry changes.
    """
    with UD_AUDIO_LOCK:
        snapshot = list(UD_AUDIOS)
    return snapshot
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def register_audio(model_spec: CustomAudioModelFamilyV1, persist: bool):
    """Register a user-defined audio model and optionally persist its spec.

    All validation happens before the in-memory registry is modified, so a
    rejected spec never ends up in ``UD_AUDIOS``.

    Args:
        model_spec: The custom audio model spec to register.
        persist: When True, the model URI (if any) is validated and the spec
            is written as JSON to
            ``<XINFERENCE_MODEL_DIR>/audio/<model_name>.json``.

    Raises:
        ValueError: if the model name is invalid, if it conflicts with a
            built-in or already registered model, or if ``persist`` is True
            and the model URI is invalid.
    """
    # Function-scope imports, kept as in the original implementation.
    from ...constants import XINFERENCE_MODEL_DIR
    from ..utils import is_valid_model_name, is_valid_model_uri
    from . import BUILTIN_AUDIO_MODELS

    if not is_valid_model_name(model_spec.model_name):
        raise ValueError(f"Invalid model name {model_spec.model_name}.")

    with UD_AUDIO_LOCK:
        name_taken = model_spec.model_name in BUILTIN_AUDIO_MODELS or any(
            spec.model_name == model_spec.model_name for spec in UD_AUDIOS
        )
        if name_taken:
            raise ValueError(
                f"Model name conflicts with existing model {model_spec.model_name}"
            )

        if persist:
            # We only validate model URL when persist is True. This check runs
            # before the append below so that an invalid URI cannot leave a
            # half-registered model behind in the in-memory registry.
            model_uri = model_spec.model_uri
            if model_uri and not is_valid_model_uri(model_uri):
                raise ValueError(f"Invalid model URI {model_uri}.")

        UD_AUDIOS.append(model_spec)

    if persist:
        persist_path = os.path.join(
            XINFERENCE_MODEL_DIR, "audio", f"{model_spec.model_name}.json"
        )
        os.makedirs(os.path.dirname(persist_path), exist_ok=True)
        with open(persist_path, mode="w") as fd:
            fd.write(model_spec.json())
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def unregister_audio(model_name: str, raise_error: bool = True):
    """Remove a user-defined audio model from the registry and clean up files.

    On success the spec is removed from ``UD_AUDIOS``, its persisted JSON file
    under ``<XINFERENCE_MODEL_DIR>/audio`` is deleted if present, and its
    cache entry under ``XINFERENCE_CACHE_DIR`` is unlinked when that entry is
    a symbolic link. A cache entry that is not a symbolic link is left alone
    and a warning asks the user to remove it manually.

    Args:
        model_name: Name of the custom audio model to unregister.
        raise_error: When True (default), raise if the model is not
            registered; otherwise only log a warning.

    Raises:
        ValueError: if the model is not registered and ``raise_error`` is
            True.
    """
    with UD_AUDIO_LOCK:
        model_spec = next(
            (spec for spec in UD_AUDIOS if spec.model_name == model_name), None
        )

        if model_spec is None:
            if raise_error:
                raise ValueError(f"Model {model_name} not found")
            logger.warning(f"Custom audio model {model_name} not found")
            return

        UD_AUDIOS.remove(model_spec)

        persist_path = os.path.join(
            XINFERENCE_MODEL_DIR, "audio", f"{model_spec.model_name}.json"
        )
        if os.path.exists(persist_path):
            os.remove(persist_path)

        cache_dir = os.path.join(XINFERENCE_CACHE_DIR, model_spec.model_name)
        if os.path.exists(cache_dir):
            logger.warning(
                f"Remove the cache of user-defined model {model_spec.model_name}. "
                f"Cache directory: {cache_dir}"
            )
            # Only a symbolic link is removed automatically. The previous
            # isdir()/rmdir() pair raised OSError both for a symlink and for a
            # real, non-empty cache directory, and it contradicted the
            # warning text below.
            if os.path.islink(cache_dir):
                os.remove(cache_dir)
            else:
                logger.warning(
                    f"Cache directory is not a soft link, please remove it manually."
                )
|
xinference/model/core.py
CHANGED
|
@@ -13,9 +13,10 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from abc import ABC, abstractmethod
|
|
16
|
-
from typing import Any,
|
|
16
|
+
from typing import Any, List, Optional, Tuple, Union
|
|
17
17
|
|
|
18
18
|
from .._compat import BaseModel
|
|
19
|
+
from ..types import PeftModelConfig
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class ModelDescription(ABC):
|
|
@@ -50,11 +51,9 @@ def create_model_instance(
|
|
|
50
51
|
model_type: str,
|
|
51
52
|
model_name: str,
|
|
52
53
|
model_format: Optional[str] = None,
|
|
53
|
-
model_size_in_billions: Optional[int] = None,
|
|
54
|
+
model_size_in_billions: Optional[Union[int, str]] = None,
|
|
54
55
|
quantization: Optional[str] = None,
|
|
55
|
-
|
|
56
|
-
image_lora_load_kwargs: Optional[Dict] = None,
|
|
57
|
-
image_lora_fuse_kwargs: Optional[Dict] = None,
|
|
56
|
+
peft_model_config: Optional[PeftModelConfig] = None,
|
|
58
57
|
is_local_deployment: bool = False,
|
|
59
58
|
**kwargs,
|
|
60
59
|
) -> Tuple[Any, ModelDescription]:
|
|
@@ -73,7 +72,7 @@ def create_model_instance(
|
|
|
73
72
|
model_format,
|
|
74
73
|
model_size_in_billions,
|
|
75
74
|
quantization,
|
|
76
|
-
|
|
75
|
+
peft_model_config,
|
|
77
76
|
is_local_deployment,
|
|
78
77
|
**kwargs,
|
|
79
78
|
)
|
|
@@ -90,9 +89,7 @@ def create_model_instance(
|
|
|
90
89
|
devices,
|
|
91
90
|
model_uid,
|
|
92
91
|
model_name,
|
|
93
|
-
|
|
94
|
-
lora_load_kwargs=image_lora_load_kwargs,
|
|
95
|
-
lora_fuse_kwargs=image_lora_fuse_kwargs,
|
|
92
|
+
peft_model_config,
|
|
96
93
|
**kwargs,
|
|
97
94
|
)
|
|
98
95
|
elif model_type == "rerank":
|
|
@@ -206,5 +206,29 @@
|
|
|
206
206
|
"language": ["zh", "en"],
|
|
207
207
|
"model_id": "maidalun1020/bce-embedding-base_v1",
|
|
208
208
|
"model_revision": "236d9024fc1b4046f03848723f934521a66a9323"
|
|
209
|
+
},
|
|
210
|
+
{
|
|
211
|
+
"model_name": "m3e-small",
|
|
212
|
+
"dimensions": 512,
|
|
213
|
+
"max_tokens": 512,
|
|
214
|
+
"language": ["zh", "en"],
|
|
215
|
+
"model_id": "moka-ai/m3e-small",
|
|
216
|
+
"model_revision": "44c696631b2a8c200220aaaad5f987f096e986df"
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
"model_name": "m3e-base",
|
|
220
|
+
"dimensions": 768,
|
|
221
|
+
"max_tokens": 512,
|
|
222
|
+
"language": ["zh", "en"],
|
|
223
|
+
"model_id": "moka-ai/m3e-base",
|
|
224
|
+
"model_revision": "764b537a0e50e5c7d64db883f2d2e051cbe3c64c"
|
|
225
|
+
},
|
|
226
|
+
{
|
|
227
|
+
"model_name": "m3e-large",
|
|
228
|
+
"dimensions": 1024,
|
|
229
|
+
"max_tokens": 512,
|
|
230
|
+
"language": ["zh", "en"],
|
|
231
|
+
"model_id": "moka-ai/m3e-large",
|
|
232
|
+
"model_revision": "12900375086c37ba5d83d1e417b21dc7d1d1f388"
|
|
209
233
|
}
|
|
210
234
|
]
|
|
@@ -208,5 +208,29 @@
|
|
|
208
208
|
"language": ["zh", "en"],
|
|
209
209
|
"model_id": "maidalun/bce-embedding-base_v1",
|
|
210
210
|
"model_hub": "modelscope"
|
|
211
|
+
},
|
|
212
|
+
{
|
|
213
|
+
"model_name": "m3e-small",
|
|
214
|
+
"dimensions": 512,
|
|
215
|
+
"max_tokens": 512,
|
|
216
|
+
"language": ["zh", "en"],
|
|
217
|
+
"model_id": "AI-ModelScope/m3e-small",
|
|
218
|
+
"model_hub": "modelscope"
|
|
219
|
+
},
|
|
220
|
+
{
|
|
221
|
+
"model_name": "m3e-base",
|
|
222
|
+
"dimensions": 768,
|
|
223
|
+
"max_tokens": 512,
|
|
224
|
+
"language": ["zh", "en"],
|
|
225
|
+
"model_id": "AI-ModelScope/m3e-base",
|
|
226
|
+
"model_hub": "modelscope"
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
"model_name": "m3e-large",
|
|
230
|
+
"dimensions": 1024,
|
|
231
|
+
"max_tokens": 512,
|
|
232
|
+
"language": ["zh", "en"],
|
|
233
|
+
"model_id": "AI-ModelScope/m3e-large",
|
|
234
|
+
"model_hub": "modelscope"
|
|
211
235
|
}
|
|
212
236
|
]
|
xinference/model/image/core.py
CHANGED
|
@@ -18,6 +18,7 @@ from collections import defaultdict
|
|
|
18
18
|
from typing import Dict, List, Optional, Tuple
|
|
19
19
|
|
|
20
20
|
from ...constants import XINFERENCE_CACHE_DIR
|
|
21
|
+
from ...types import PeftModelConfig
|
|
21
22
|
from ..core import CacheableModelSpec, ModelDescription
|
|
22
23
|
from ..utils import valid_model_revision
|
|
23
24
|
from .stable_diffusion.core import DiffusionModel
|
|
@@ -175,9 +176,7 @@ def create_image_model_instance(
|
|
|
175
176
|
devices: List[str],
|
|
176
177
|
model_uid: str,
|
|
177
178
|
model_name: str,
|
|
178
|
-
|
|
179
|
-
lora_load_kwargs: Optional[Dict] = None,
|
|
180
|
-
lora_fuse_kwargs: Optional[Dict] = None,
|
|
179
|
+
peft_model_config: Optional[PeftModelConfig] = None,
|
|
181
180
|
**kwargs,
|
|
182
181
|
) -> Tuple[DiffusionModel, ImageModelDescription]:
|
|
183
182
|
model_spec = match_diffusion(model_name)
|
|
@@ -210,10 +209,19 @@ def create_image_model_instance(
|
|
|
210
209
|
else:
|
|
211
210
|
kwargs["controlnet"] = controlnet_model_paths
|
|
212
211
|
model_path = cache(model_spec)
|
|
212
|
+
if peft_model_config is not None:
|
|
213
|
+
lora_model = peft_model_config.peft_model
|
|
214
|
+
lora_load_kwargs = peft_model_config.image_lora_load_kwargs
|
|
215
|
+
lora_fuse_kwargs = peft_model_config.image_lora_fuse_kwargs
|
|
216
|
+
else:
|
|
217
|
+
lora_model = None
|
|
218
|
+
lora_load_kwargs = None
|
|
219
|
+
lora_fuse_kwargs = None
|
|
220
|
+
|
|
213
221
|
model = DiffusionModel(
|
|
214
222
|
model_uid,
|
|
215
223
|
model_path,
|
|
216
|
-
|
|
224
|
+
lora_model_paths=lora_model,
|
|
217
225
|
lora_load_kwargs=lora_load_kwargs,
|
|
218
226
|
lora_fuse_kwargs=lora_fuse_kwargs,
|
|
219
227
|
**kwargs,
|
|
@@ -25,7 +25,7 @@ from typing import Dict, List, Optional, Union
|
|
|
25
25
|
|
|
26
26
|
from ....constants import XINFERENCE_IMAGE_DIR
|
|
27
27
|
from ....device_utils import move_model_to_available_device
|
|
28
|
-
from ....types import Image, ImageList
|
|
28
|
+
from ....types import Image, ImageList, LoRA
|
|
29
29
|
|
|
30
30
|
logger = logging.getLogger(__name__)
|
|
31
31
|
|
|
@@ -36,7 +36,7 @@ class DiffusionModel:
|
|
|
36
36
|
model_uid: str,
|
|
37
37
|
model_path: str,
|
|
38
38
|
device: Optional[str] = None,
|
|
39
|
-
|
|
39
|
+
lora_model: Optional[List[LoRA]] = None,
|
|
40
40
|
lora_load_kwargs: Optional[Dict] = None,
|
|
41
41
|
lora_fuse_kwargs: Optional[Dict] = None,
|
|
42
42
|
**kwargs,
|
|
@@ -45,20 +45,21 @@ class DiffusionModel:
|
|
|
45
45
|
self._model_path = model_path
|
|
46
46
|
self._device = device
|
|
47
47
|
self._model = None
|
|
48
|
-
self.
|
|
48
|
+
self._lora_model = lora_model
|
|
49
49
|
self._lora_load_kwargs = lora_load_kwargs or {}
|
|
50
50
|
self._lora_fuse_kwargs = lora_fuse_kwargs or {}
|
|
51
51
|
self._kwargs = kwargs
|
|
52
52
|
|
|
53
53
|
def _apply_lora(self):
|
|
54
|
-
if self.
|
|
54
|
+
if self._lora_model is not None:
|
|
55
55
|
logger.info(
|
|
56
56
|
f"Loading the LoRA with load kwargs: {self._lora_load_kwargs}, fuse kwargs: {self._lora_fuse_kwargs}."
|
|
57
57
|
)
|
|
58
58
|
assert self._model is not None
|
|
59
|
-
self.
|
|
60
|
-
self.
|
|
61
|
-
|
|
59
|
+
for lora_model in self._lora_model:
|
|
60
|
+
self._model.load_lora_weights(
|
|
61
|
+
lora_model.local_path, **self._lora_load_kwargs
|
|
62
|
+
)
|
|
62
63
|
self._model.fuse_lora(**self._lora_fuse_kwargs)
|
|
63
64
|
logger.info(f"Successfully loaded the LoRA for model {self._model_uid}.")
|
|
64
65
|
|
xinference/model/llm/core.py
CHANGED
|
@@ -21,6 +21,7 @@ from collections import defaultdict
|
|
|
21
21
|
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
|
22
22
|
|
|
23
23
|
from ...core.utils import parse_replica_model_uid
|
|
24
|
+
from ...types import PeftModelConfig
|
|
24
25
|
from ..core import ModelDescription
|
|
25
26
|
|
|
26
27
|
if TYPE_CHECKING:
|
|
@@ -178,9 +179,9 @@ def create_llm_model_instance(
|
|
|
178
179
|
model_uid: str,
|
|
179
180
|
model_name: str,
|
|
180
181
|
model_format: Optional[str] = None,
|
|
181
|
-
model_size_in_billions: Optional[int] = None,
|
|
182
|
+
model_size_in_billions: Optional[Union[int, str]] = None,
|
|
182
183
|
quantization: Optional[str] = None,
|
|
183
|
-
|
|
184
|
+
peft_model_config: Optional[PeftModelConfig] = None,
|
|
184
185
|
is_local_deployment: bool = False,
|
|
185
186
|
**kwargs,
|
|
186
187
|
) -> Tuple[LLM, LLMDescription]:
|
|
@@ -204,9 +205,9 @@ def create_llm_model_instance(
|
|
|
204
205
|
assert quantization is not None
|
|
205
206
|
save_path = cache(llm_family, llm_spec, quantization)
|
|
206
207
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
)
|
|
208
|
+
peft_model = peft_model_config.peft_model if peft_model_config else None
|
|
209
|
+
|
|
210
|
+
llm_cls = match_llm_cls(llm_family, llm_spec, quantization, peft_model=peft_model)
|
|
210
211
|
if not llm_cls:
|
|
211
212
|
raise ValueError(
|
|
212
213
|
f"Model not supported, name: {model_name}, format: {model_format},"
|
|
@@ -214,15 +215,9 @@ def create_llm_model_instance(
|
|
|
214
215
|
)
|
|
215
216
|
logger.debug(f"Launching {model_uid} with {llm_cls.__name__}")
|
|
216
217
|
|
|
217
|
-
if
|
|
218
|
+
if peft_model is not None:
|
|
218
219
|
model = llm_cls(
|
|
219
|
-
model_uid,
|
|
220
|
-
llm_family,
|
|
221
|
-
llm_spec,
|
|
222
|
-
quantization,
|
|
223
|
-
save_path,
|
|
224
|
-
kwargs,
|
|
225
|
-
peft_model_path,
|
|
220
|
+
model_uid, llm_family, llm_spec, quantization, save_path, kwargs, peft_model
|
|
226
221
|
)
|
|
227
222
|
else:
|
|
228
223
|
model = llm_cls(
|
|
@@ -238,7 +233,7 @@ def create_speculative_llm_model_instance(
|
|
|
238
233
|
devices: List[str],
|
|
239
234
|
model_uid: str,
|
|
240
235
|
model_name: str,
|
|
241
|
-
model_size_in_billions: Optional[int],
|
|
236
|
+
model_size_in_billions: Optional[Union[int, str]],
|
|
242
237
|
quantization: Optional[str],
|
|
243
238
|
draft_model_name: str,
|
|
244
239
|
draft_model_size_in_billions: Optional[int],
|
|
@@ -1930,6 +1930,74 @@
|
|
|
1930
1930
|
]
|
|
1931
1931
|
}
|
|
1932
1932
|
},
|
|
1933
|
+
{
|
|
1934
|
+
"version": 1,
|
|
1935
|
+
"context_length": 65536,
|
|
1936
|
+
"model_name": "codeqwen1.5-chat",
|
|
1937
|
+
"model_lang": [
|
|
1938
|
+
"en",
|
|
1939
|
+
"zh"
|
|
1940
|
+
],
|
|
1941
|
+
"model_ability": [
|
|
1942
|
+
"chat"
|
|
1943
|
+
],
|
|
1944
|
+
"model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
|
|
1945
|
+
"model_specs": [
|
|
1946
|
+
{
|
|
1947
|
+
"model_format": "ggufv2",
|
|
1948
|
+
"model_size_in_billions": 7,
|
|
1949
|
+
"quantizations": [
|
|
1950
|
+
"q2_k",
|
|
1951
|
+
"q3_k_m",
|
|
1952
|
+
"q4_0",
|
|
1953
|
+
"q4_k_m",
|
|
1954
|
+
"q5_0",
|
|
1955
|
+
"q5_k_m",
|
|
1956
|
+
"q6_k",
|
|
1957
|
+
"q8_0"
|
|
1958
|
+
],
|
|
1959
|
+
"model_id": "Qwen/CodeQwen1.5-7B-Chat-GGUF",
|
|
1960
|
+
"model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
|
|
1961
|
+
},
|
|
1962
|
+
{
|
|
1963
|
+
"model_format": "pytorch",
|
|
1964
|
+
"model_size_in_billions": 7,
|
|
1965
|
+
"quantizations": [
|
|
1966
|
+
"4-bit",
|
|
1967
|
+
"8-bit",
|
|
1968
|
+
"none"
|
|
1969
|
+
],
|
|
1970
|
+
"model_id": "Qwen/CodeQwen1.5-7B-Chat"
|
|
1971
|
+
},
|
|
1972
|
+
{
|
|
1973
|
+
"model_format": "awq",
|
|
1974
|
+
"model_size_in_billions": 7,
|
|
1975
|
+
"quantizations": [
|
|
1976
|
+
"Int4"
|
|
1977
|
+
],
|
|
1978
|
+
"model_id": "Qwen/CodeQwen1.5-7B-Chat-AWQ"
|
|
1979
|
+
}
|
|
1980
|
+
],
|
|
1981
|
+
"prompt_style": {
|
|
1982
|
+
"style_name": "QWEN",
|
|
1983
|
+
"system_prompt": "You are a helpful assistant.",
|
|
1984
|
+
"roles": [
|
|
1985
|
+
"user",
|
|
1986
|
+
"assistant"
|
|
1987
|
+
],
|
|
1988
|
+
"intra_message_sep": "\n",
|
|
1989
|
+
"stop_token_ids": [
|
|
1990
|
+
151643,
|
|
1991
|
+
151644,
|
|
1992
|
+
151645
|
|
1993
|
+
],
|
|
1994
|
+
"stop": [
|
|
1995
|
+
"<|endoftext|>",
|
|
1996
|
+
"<|im_start|>",
|
|
1997
|
+
"<|im_end|>"
|
|
1998
|
+
]
|
|
1999
|
+
}
|
|
2000
|
+
},
|
|
1933
2001
|
{
|
|
1934
2002
|
"version": 1,
|
|
1935
2003
|
"context_length": 8192,
|
|
@@ -4752,5 +4820,200 @@
|
|
|
4752
4820
|
"</s>"
|
|
4753
4821
|
]
|
|
4754
4822
|
}
|
|
4823
|
+
},
|
|
4824
|
+
{
|
|
4825
|
+
"version": 1,
|
|
4826
|
+
"context_length": 8192,
|
|
4827
|
+
"model_name": "seallm_v2",
|
|
4828
|
+
"model_lang": [
|
|
4829
|
+
"en",
|
|
4830
|
+
"zh",
|
|
4831
|
+
"vi",
|
|
4832
|
+
"id",
|
|
4833
|
+
"th",
|
|
4834
|
+
"ms",
|
|
4835
|
+
"km",
|
|
4836
|
+
"lo",
|
|
4837
|
+
"my",
|
|
4838
|
+
"tl"
|
|
4839
|
+
],
|
|
4840
|
+
"model_ability": [
|
|
4841
|
+
"generate"
|
|
4842
|
+
],
|
|
4843
|
+
"model_description": "We introduce SeaLLM-7B-v2, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
|
|
4844
|
+
"model_specs": [
|
|
4845
|
+
{
|
|
4846
|
+
"model_format": "pytorch",
|
|
4847
|
+
"model_size_in_billions": 7,
|
|
4848
|
+
"quantizations": [
|
|
4849
|
+
"none"
|
|
4850
|
+
],
|
|
4851
|
+
"model_id": "SeaLLMs/SeaLLM-7B-v2",
|
|
4852
|
+
"model_revision": "f1bd48e0d75365c24a3c5ad006b2d0a0c9dca30f"
|
|
4853
|
+
},
|
|
4854
|
+
{
|
|
4855
|
+
"model_format": "ggufv2",
|
|
4856
|
+
"model_size_in_billions": 7,
|
|
4857
|
+
"quantizations": [
|
|
4858
|
+
"Q4_0",
|
|
4859
|
+
"Q8_0"
|
|
4860
|
+
],
|
|
4861
|
+
"model_id": "SeaLLMs/SeaLLM-7B-v2-gguf",
|
|
4862
|
+
"model_file_name_template": "SeaLLM-7B-v2.{quantization}.gguf"
|
|
4863
|
+
}
|
|
4864
|
+
]
|
|
4865
|
+
},
|
|
4866
|
+
{
|
|
4867
|
+
"version": 1,
|
|
4868
|
+
"context_length": 8192,
|
|
4869
|
+
"model_name": "seallm_v2.5",
|
|
4870
|
+
"model_lang": [
|
|
4871
|
+
"en",
|
|
4872
|
+
"zh",
|
|
4873
|
+
"vi",
|
|
4874
|
+
"id",
|
|
4875
|
+
"th",
|
|
4876
|
+
"ms",
|
|
4877
|
+
"km",
|
|
4878
|
+
"lo",
|
|
4879
|
+
"my",
|
|
4880
|
+
"tl"
|
|
4881
|
+
],
|
|
4882
|
+
"model_ability": [
|
|
4883
|
+
"generate"
|
|
4884
|
+
],
|
|
4885
|
+
"model_description": "We introduce SeaLLM-7B-v2.5, the state-of-the-art multilingual LLM for Southeast Asian (SEA) languages",
|
|
4886
|
+
"model_specs": [
|
|
4887
|
+
{
|
|
4888
|
+
"model_format": "pytorch",
|
|
4889
|
+
"model_size_in_billions": 7,
|
|
4890
|
+
"quantizations": [
|
|
4891
|
+
"none"
|
|
4892
|
+
],
|
|
4893
|
+
"model_id": "SeaLLMs/SeaLLM-7B-v2.5",
|
|
4894
|
+
"model_revision": "c54a8eb8e2d58c5a680bfbbe3a7ae71753bb644b"
|
|
4895
|
+
},
|
|
4896
|
+
{
|
|
4897
|
+
"model_format": "ggufv2",
|
|
4898
|
+
"model_size_in_billions": 7,
|
|
4899
|
+
"quantizations": [
|
|
4900
|
+
"Q4_K_M",
|
|
4901
|
+
"Q8_0"
|
|
4902
|
+
],
|
|
4903
|
+
"model_id": "SeaLLMs/SeaLLM-7B-v2.5-GGUF",
|
|
4904
|
+
"model_file_name_template": "SeaLLM-7B-v2.5.{quantization}.gguf"
|
|
4905
|
+
}
|
|
4906
|
+
]
|
|
4907
|
+
},
|
|
4908
|
+
{
|
|
4909
|
+
"version": 1,
|
|
4910
|
+
"context_length": 131072,
|
|
4911
|
+
"model_name": "c4ai-command-r-v01",
|
|
4912
|
+
"model_lang": [
|
|
4913
|
+
"en",
|
|
4914
|
+
"fr",
|
|
4915
|
+
"de",
|
|
4916
|
+
"es",
|
|
4917
|
+
"it",
|
|
4918
|
+
"pt",
|
|
4919
|
+
"ja",
|
|
4920
|
+
"ko",
|
|
4921
|
+
"zh",
|
|
4922
|
+
"ar"
|
|
4923
|
+
],
|
|
4924
|
+
"model_ability": [
|
|
4925
|
+
"generate"
|
|
4926
|
+
],
|
|
4927
|
+
"model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
|
|
4928
|
+
"model_specs": [
|
|
4929
|
+
{
|
|
4930
|
+
"model_format": "pytorch",
|
|
4931
|
+
"model_size_in_billions": 35,
|
|
4932
|
+
"quantizations": [
|
|
4933
|
+
"none"
|
|
4934
|
+
],
|
|
4935
|
+
"model_id": "CohereForAI/c4ai-command-r-v01",
|
|
4936
|
+
"model_revision": "16881ccde1c68bbc7041280e6a66637bc46bfe88"
|
|
4937
|
+
},
|
|
4938
|
+
{
|
|
4939
|
+
"model_format": "ggufv2",
|
|
4940
|
+
"model_size_in_billions": 35,
|
|
4941
|
+
"quantizations": [
|
|
4942
|
+
"Q2_K",
|
|
4943
|
+
"Q3_K_L",
|
|
4944
|
+
"Q3_K_M",
|
|
4945
|
+
"Q3_K_S",
|
|
4946
|
+
"Q4_0",
|
|
4947
|
+
"Q4_K_M",
|
|
4948
|
+
"Q4_K_S",
|
|
4949
|
+
"Q5_0",
|
|
4950
|
+
"Q5_K_M",
|
|
4951
|
+
"Q5_K_S",
|
|
4952
|
+
"Q6_K",
|
|
4953
|
+
"Q8_0"
|
|
4954
|
+
],
|
|
4955
|
+
"model_id": "andrewcanis/c4ai-command-r-v01-GGUF",
|
|
4956
|
+
"model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf"
|
|
4957
|
+
},
|
|
4958
|
+
{
|
|
4959
|
+
"model_format": "pytorch",
|
|
4960
|
+
"model_size_in_billions": 104,
|
|
4961
|
+
"quantizations": [
|
|
4962
|
+
"none"
|
|
4963
|
+
],
|
|
4964
|
+
"model_id": "CohereForAI/c4ai-command-r-plus",
|
|
4965
|
+
"model_revision": "ba7f1d954c9d1609013677d87e4142ab95c34e62"
|
|
4966
|
+
},
|
|
4967
|
+
{
|
|
4968
|
+
"model_format": "gptq",
|
|
4969
|
+
"model_size_in_billions": 104,
|
|
4970
|
+
"quantizations": [
|
|
4971
|
+
"Int4"
|
|
4972
|
+
],
|
|
4973
|
+
"model_id": "alpindale/c4ai-command-r-plus-GPTQ",
|
|
4974
|
+
"model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e"
|
|
4975
|
+
}
|
|
4976
|
+
]
|
|
4977
|
+
},
|
|
4978
|
+
{
|
|
4979
|
+
"version": 1,
|
|
4980
|
+
"context_length": 131072,
|
|
4981
|
+
"model_name": "c4ai-command-r-v01-4bit",
|
|
4982
|
+
"model_lang": [
|
|
4983
|
+
"en",
|
|
4984
|
+
"fr",
|
|
4985
|
+
"de",
|
|
4986
|
+
"es",
|
|
4987
|
+
"it",
|
|
4988
|
+
"pt",
|
|
4989
|
+
"ja",
|
|
4990
|
+
"ko",
|
|
4991
|
+
"zh",
|
|
4992
|
+
"ar"
|
|
4993
|
+
],
|
|
4994
|
+
"model_ability": [
|
|
4995
|
+
"generate"
|
|
4996
|
+
],
|
|
4997
|
+
"model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
|
|
4998
|
+
"model_specs": [
|
|
4999
|
+
{
|
|
5000
|
+
"model_format": "pytorch",
|
|
5001
|
+
"model_size_in_billions": 35,
|
|
5002
|
+
"quantizations": [
|
|
5003
|
+
"none"
|
|
5004
|
+
],
|
|
5005
|
+
"model_id": "CohereForAI/c4ai-command-r-v01-4bit",
|
|
5006
|
+
"model_revision": "f2e87936a146643c9dd143422dcafb9cb1552611"
|
|
5007
|
+
},
|
|
5008
|
+
{
|
|
5009
|
+
"model_format": "pytorch",
|
|
5010
|
+
"model_size_in_billions": 104,
|
|
5011
|
+
"quantizations": [
|
|
5012
|
+
"none"
|
|
5013
|
+
],
|
|
5014
|
+
"model_id": "CohereForAI/c4ai-command-r-plus-4bit",
|
|
5015
|
+
"model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
|
|
5016
|
+
}
|
|
5017
|
+
]
|
|
4755
5018
|
}
|
|
4756
5019
|
]
|