xinference 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +9 -9
- xinference/client/restful/restful_client.py +32 -16
- xinference/core/supervisor.py +32 -9
- xinference/core/worker.py +13 -8
- xinference/deploy/cmdline.py +22 -9
- xinference/model/audio/__init__.py +40 -1
- xinference/model/audio/core.py +25 -45
- xinference/model/audio/custom.py +148 -0
- xinference/model/core.py +6 -9
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/image/core.py +12 -4
- xinference/model/image/stable_diffusion/core.py +8 -7
- xinference/model/llm/core.py +9 -14
- xinference/model/llm/llm_family.json +263 -0
- xinference/model/llm/llm_family.py +26 -4
- xinference/model/llm/llm_family_modelscope.json +160 -0
- xinference/model/llm/pytorch/baichuan.py +4 -3
- xinference/model/llm/pytorch/chatglm.py +3 -2
- xinference/model/llm/pytorch/core.py +15 -13
- xinference/model/llm/pytorch/falcon.py +6 -5
- xinference/model/llm/pytorch/internlm2.py +3 -2
- xinference/model/llm/pytorch/llama_2.py +6 -5
- xinference/model/llm/pytorch/vicuna.py +4 -3
- xinference/model/llm/vllm/core.py +3 -0
- xinference/model/rerank/core.py +23 -12
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +25 -1
- xinference/model/utils.py +12 -1
- xinference/types.py +55 -0
- xinference/utils.py +1 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.26fdbfbe.js +3 -0
- xinference/web/ui/build/static/js/main.26fdbfbe.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f9290c0738db50065492ceedc6a4af25083fe18399b7c44d942273349ad9e643.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/feabb04b4aa507102da0a64398a40818e878fd1df9b75dda8461b3e1e7ff3f11.json +1 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/METADATA +4 -1
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/RECORD +49 -46
- xinference/web/ui/build/static/js/main.76ef2b17.js +0 -3
- xinference/web/ui/build/static/js/main.76ef2b17.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +0 -1
- /xinference/web/ui/build/static/js/{main.76ef2b17.js.LICENSE.txt → main.26fdbfbe.js.LICENSE.txt} +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/LICENSE +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/WHEEL +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.1.dist-info → xinference-0.10.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py
CHANGED

@@ -33,6 +33,7 @@ from ..._compat import (
     validator,
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
+from ...types import LoRA
 from ..utils import (
     download_from_modelscope,
     is_valid_model_uri,
@@ -797,10 +798,29 @@ def get_user_defined_llm_families():
     return UD_LLM_FAMILIES.copy()


+def match_model_size(
+    model_size: Union[int, str], spec_model_size: Union[int, str]
+) -> bool:
+    if isinstance(model_size, str):
+        model_size = model_size.replace("_", ".")
+    if isinstance(spec_model_size, str):
+        spec_model_size = spec_model_size.replace("_", ".")
+
+    if model_size == spec_model_size:
+        return True
+
+    try:
+        ms = int(model_size)
+        ss = int(spec_model_size)
+        return ms == ss
+    except ValueError:
+        return False
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
-    model_size_in_billions: Optional[int] = None,
+    model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     is_local_deployment: bool = False,
 ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
@@ -844,7 +864,9 @@ def match_llm(
         model_format
         and model_format != spec.model_format
         or model_size_in_billions
-        and model_size_in_billions != spec.model_size_in_billions
+        and not match_model_size(
+            model_size_in_billions, spec.model_size_in_billions
+        )
         or quantization
         and matched_quantization is None
     ):
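The new matcher lets model sizes be written either as integers or as strings that use an underscore as the decimal separator (the spec files encode a 1.8B model as "1_8", since "." cannot appear there). A quick illustrative sketch of its behavior; these calls are examples, not part of the diff:

# Illustrative calls against match_model_size() as defined above.
assert match_model_size("1_8", "1_8")   # underscores normalize to "1.8" on both sides
assert match_model_size(7, "7")         # int and numeric string agree via int()
assert not match_model_size("1_8", 2)   # int("1.8") raises ValueError, so no match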
@@ -954,12 +976,12 @@ def match_llm_cls(
     family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
     quantization: str,
-    peft_model_path: Optional[str] = None,
+    peft_model: Optional[List[LoRA]] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    if peft_model_path is not None:
+    if peft_model is not None:
         for cls in PEFT_SUPPORTED_CLASSES:
             if cls.match(family, llm_spec, quantization):
                 return cls
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -2175,6 +2175,77 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 65536,
+    "model_name": "codeqwen1.5-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+    "model_specs": [
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "q2_k",
+          "q3_k_m",
+          "q4_0",
+          "q4_k_m",
+          "q5_0",
+          "q5_k_m",
+          "q6_k",
+          "q8_0"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-GGUF",
+        "model_hub": "modelscope",
+        "model_file_name_template": "codeqwen-1_5-7b-chat-{quantization}.gguf"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "qwen/CodeQwen1.5-7B-Chat-AWQ",
+        "model_hub": "modelscope"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ],
+      "intra_message_sep": "\n",
+      "stop_token_ids": [
+        151643,
+        151644,
+        151645
+      ],
+      "stop": [
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 4096,
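As a usage sketch (not from the diff): with 0.10.2 installed and a local supervisor running, a spec like the one above can be exercised through the documented Python client. The endpoint and quantization here are placeholder choices:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(
    model_name="codeqwen1.5-chat",
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="q4_k_m",
)
model = client.get_model(uid)
print(model.chat("Write a binary search function in Python."))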
@@ -3045,5 +3116,94 @@
         "</s>"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-v01",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "Q2_K",
+          "Q4_K_M",
+          "Q5_K_M"
+        ],
+        "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
+        "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 104,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/c4ai-command-r-plus",
+        "model_revision": "master"
+      }
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "c4ai-command-r-v01-4bit",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "ja",
+      "ko",
+      "zh",
+      "ar"
+    ],
+    "model_ability": [
+      "generate"
+    ],
+    "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 35,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "mirror013/c4ai-command-r-v01-4bit",
+        "model_revision": "master"
+      }
+    ]
   }
 ]
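Both Command-R entries declare only the "generate" ability, so they are driven through completions rather than chat. A hedged sketch against the documented client API, again with a placeholder endpoint:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(
    model_name="c4ai-command-r-v01",
    model_format="ggufv2",
    model_size_in_billions=35,
    quantization="Q4_K_M",
)
model = client.get_model(uid)
print(model.generate("Command-R in one sentence:", generate_config={"max_tokens": 64}))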
xinference/model/llm/pytorch/baichuan.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/chatglm.py
CHANGED

@@ -24,6 +24,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -39,7 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -48,7 +49,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py
CHANGED

@@ -32,6 +32,7 @@ from ....types import (
     Embedding,
     EmbeddingData,
     EmbeddingUsage,
+    LoRA,
     PytorchGenerateConfig,
     PytorchModelConfig,
 )
@@ -71,14 +72,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
-        self._peft_model_path = peft_model_path
+        self._peft_model = peft_model

     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -134,7 +135,7 @@ class PytorchModel(LLM):
         return model, tokenizer

     def _apply_lora(self):
-        if self._peft_model_path is not None:
+        if self._peft_model is not None:
             try:
                 from peft import PeftModel
             except ImportError:
@@ -142,14 +143,15 @@ class PytorchModel(LLM):
                     f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
                 )

-            # Apply LoRA
-            self._model = PeftModel.from_pretrained(
-                self._model,
-                self._peft_model_path,
-            )
-            logger.info(f"PEFT adaptor loaded for model '{self.model_uid}'.")
-
-
+            for peft_model in self._peft_model:
+                # Apply LoRA
+                self._model = PeftModel.from_pretrained(
+                    self._model,
+                    peft_model.local_path,
+                )
+                logger.info(
+                    f"PEFT adaptor '{peft_model.lora_name}' successfully loaded for model '{self.model_uid}'."
+                )

     def load(self):
         try:
@@ -421,7 +423,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -430,7 +432,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
-            peft_model_path,
+            peft_model,
         )

     def _sanitize_generate_config(
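The loop in _apply_lora now folds every adapter in the list into the model, one after another. A standalone sketch of the same pattern using peft directly; the base checkpoint and adapter paths are placeholders:

from dataclasses import dataclass

from peft import PeftModel
from transformers import AutoModelForCausalLM


@dataclass
class LoRA:  # stand-in for xinference.types.LoRA (lora_name, local_path)
    lora_name: str
    local_path: str


adapters = [LoRA("sql", "/adapters/sql"), LoRA("chat", "/adapters/chat")]
model = AutoModelForCausalLM.from_pretrained("org/base-model")  # placeholder id
for adapter in adapters:
    # Each call wraps the current model, so adapters apply in sequence.
    model = PeftModel.from_pretrained(model, adapter.local_path)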
xinference/model/llm/pytorch/falcon.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
@@ -86,7 +87,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -95,7 +96,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py
CHANGED

@@ -23,6 +23,7 @@ from ....types import (
     CompletionChoice,
     CompletionChunk,
     CompletionUsage,
+    LoRA,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -38,7 +39,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,7 +48,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py
CHANGED

@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModel, PytorchModelConfig

@@ -27,7 +28,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -36,7 +37,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )

     def _load_model(self, **kwargs):
@@ -69,8 +70,8 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
-        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -78,7 +79,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/vicuna.py
CHANGED

@@ -26,8 +26,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import List, Optional

+from ....types import LoRA
 from .. import LLMFamilyV1, LLMSpecV1
 from .core import PytorchChatModel, PytorchModelConfig

@@ -41,7 +42,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
-        peft_model_path: Optional[str] = None,
+        peft_model: Optional[List[LoRA]] = None,
     ):
         super().__init__(
             model_uid,
@@ -50,7 +51,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
-            peft_model_path=peft_model_path,
+            peft_model=peft_model,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/vllm/core.py
CHANGED

@@ -116,6 +116,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
 ]
 if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-chat")
+    VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -126,6 +127,8 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.3":

 if VLLM_INSTALLED and vllm.__version__ >= "0.4.0":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen1.5-moe-chat")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01")
+    VLLM_SUPPORTED_MODELS.append("c4ai-command-r-v01-4bit")


 class VLLMModel(LLM):
xinference/model/rerank/core.py
CHANGED
@@ -42,8 +42,9 @@ def get_rerank_model_descriptions():
 class RerankModelSpec(CacheableModelSpec):
     model_name: str
     language: List[str]
+    type: Optional[str] = "normal"
     model_id: str
-    model_revision: str
+    model_revision: Optional[str]
     model_hub: str = "huggingface"

@@ -63,6 +64,7 @@ class RerankModelDescription(ModelDescription):
         "model_type": "rerank",
         "address": self.address,
         "accelerators": self.devices,
+        "type": self._model_spec.type,
         "model_name": self._model_spec.model_name,
         "language": self._model_spec.language,
         "model_revision": self._model_spec.model_revision,
@@ -97,12 +99,14 @@ def generate_rerank_description(model_spec: RerankModelSpec) -> Dict[str, List[Dict]]:
 class RerankModel:
     def __init__(
         self,
+        model_spec: RerankModelSpec,
         model_uid: str,
         model_path: str,
         device: Optional[str] = None,
         use_fp16: bool = False,
         model_config: Optional[Dict] = None,
     ):
+        self._model_spec = model_spec
         self._model_uid = model_uid
         self._model_path = model_path
         self._device = device
@@ -112,20 +116,25 @@ class RerankModel:

     def load(self):
         try:
-            from sentence_transformers.cross_encoder import CrossEncoder
+            if self._model_spec.type == "normal":
+                from FlagEmbedding import FlagReranker
+            elif self._model_spec.type == "LLM-based":
+                from FlagEmbedding import FlagLLMReranker as FlagReranker
+            elif self._model_spec.type == "LLM-based layerwise":
+                from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+            else:
+                raise RuntimeError(
+                    f"Unsupported Rank model type: {self._model_spec.type}"
+                )
         except ImportError:
-            error_message = "Failed to import module 'sentence-transformers'"
+            error_message = "Failed to import module 'FlagEmbedding'"
             installation_guide = [
-                "Please make sure 'sentence-transformers' is installed. ",
-                "You can install it by `pip install sentence-transformers`\n",
+                "Please make sure 'FlagEmbedding' is installed. ",
+                "You can install it by `pip install FlagEmbedding`\n",
             ]

             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-        self._model = CrossEncoder(
-            self._model_path, device=self._device, **self._model_config
-        )
-        if self._use_fp16:
-            self._model.model.half()
+        self._model = FlagReranker(self._model_path, use_fp16=True)

     def rerank(
         self,
@@ -142,7 +151,7 @@ class RerankModel:
         if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.predict(sentence_combinations)
+        similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
@@ -224,7 +233,9 @@ def create_rerank_model_instance(

     model_path = cache(model_spec)
     use_fp16 = kwargs.pop("use_fp16", False)
-    model = RerankModel(model_uid, model_path, use_fp16=use_fp16, model_config=kwargs)
+    model = RerankModel(
+        model_spec, model_uid, model_path, use_fp16=use_fp16, model_config=kwargs
+    )
     model_description = RerankModelDescription(
         subpool_addr, devices, model_spec, model_path=model_path
     )
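For context: the three FlagEmbedding classes selected in load() share the compute_score interface. A sketch based on FlagEmbedding's documented usage; the model ids match the new spec entries below, and cutoff_layers follows the FlagEmbedding README for layerwise rerankers:

from FlagEmbedding import FlagLLMReranker, FlagReranker, LayerWiseFlagLLMReranker

pairs = [["what is panda?", "The giant panda is a bear species endemic to China."]]

print(FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True).compute_score(pairs))
print(FlagLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True).compute_score(pairs))
print(
    LayerWiseFlagLLMReranker(
        "BAAI/bge-reranker-v2-minicpm-layerwise", use_fp16=True
    ).compute_score(pairs, cutoff_layers=[28])
)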
xinference/model/rerank/model_spec.json
CHANGED

@@ -1,20 +1,44 @@
 [
   {
     "model_name": "bge-reranker-large",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-large",
     "model_revision": "27c9168d479987529781de8474dff94d69beca11"
   },
   {
     "model_name": "bge-reranker-base",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "BAAI/bge-reranker-base",
     "model_revision": "465b4b7ddf2be0a020c8ad6e525b9bb1dbb708ae"
   },
   {
     "model_name": "bce-reranker-base_v1",
+    "type": "normal",
     "language": ["en", "zh"],
     "model_id": "maidalun1020/bce-reranker-base_v1",
     "model_revision": "eaa31a577a0574e87a08959bd229ca14ce1b5496"
+  },
+  {
+    "model_name": "bge-reranker-v2-m3",
+    "type": "normal",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-m3",
+    "model_revision": "12e974610ba9083ed95f3edf08d7e899581f4de4"
+  },
+  {
+    "model_name": "bge-reranker-v2-gemma",
+    "type": "LLM-based",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-gemma",
+    "model_revision": "1787044f8b6fb740a9de4557c3a12377f84d9e17"
+  },
+  {
+    "model_name": "bge-reranker-v2-minicpm-layerwise",
+    "type": "LLM-based layerwise",
+    "language": ["en", "zh", "multilingual"],
+    "model_id": "BAAI/bge-reranker-v2-minicpm-layerwise",
+    "model_revision": "47b5332b296c4d8cb6ee2c60502cc62a0d708881"
   }
 ]
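End to end, the new entries are reachable through the documented rerank client API; a sketch with a placeholder endpoint and a toy corpus:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed local endpoint
uid = client.launch_model(model_name="bge-reranker-v2-m3", model_type="rerank")
model = client.get_model(uid)

query = "A man is eating pasta."
corpus = [
    "A man is eating food.",
    "A man is riding a horse.",
    "A monkey is playing drums.",
]
print(model.rerank(corpus, query))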