xinference 0.10.2.post1__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as possibly problematic.
- xinference/_version.py +3 -3
- xinference/core/worker.py +8 -4
- xinference/deploy/cmdline.py +9 -5
- xinference/model/audio/model_spec.json +8 -1
- xinference/model/embedding/core.py +13 -0
- xinference/model/llm/llm_family.json +143 -1
- xinference/model/llm/llm_family_modelscope.json +91 -1
- xinference/model/llm/utils.py +16 -0
- xinference/model/llm/vllm/core.py +2 -0
- xinference/model/rerank/core.py +48 -20
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/METADATA +4 -3
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/RECORD +16 -16
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/LICENSE +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/WHEEL +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-04-…",
+ "date": "2024-04-24T10:45:37+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "…",
- "version": "0.10.2.post1"
+ "full-revisionid": "2ba72b0ed55c2dbff12491485ffacee7996d3490",
+ "version": "0.10.3"
 }
 '''  # END VERSION_JSON
xinference/core/worker.py
CHANGED
@@ -612,6 +612,14 @@ class WorkerActor(xo.StatelessActor):
         gpu_idx: Optional[Union[int, List[int]]] = None,
         **kwargs,
     ):
+        # !!! Note that The following code must be placed at the very beginning of this function,
+        # or there will be problems with auto-recovery.
+        # Because `locals()` will collect all the local parameters of this function and pass to this function again.
+        launch_args = locals()
+        launch_args.pop("self")
+        launch_args.pop("kwargs")
+        launch_args.update(kwargs)
+
         event_model_uid, _, __ = parse_replica_model_uid(model_uid)
         await self._event_collector_ref.report_event(
             event_model_uid,
@@ -621,10 +629,6 @@ class WorkerActor(xo.StatelessActor):
                 event_content="Launch model",
             ),
         )
-        launch_args = locals()
-        launch_args.pop("self")
-        launch_args.pop("kwargs")
-        launch_args.update(kwargs)
 
         if gpu_idx is not None:
             logger.info(
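The reordering matters because `locals()` snapshots every name bound in the frame at the moment it is called. Taken after the event-reporting block, the snapshot would also contain incidental locals such as `event_model_uid`, and replaying the launch with those extra keys during auto-recovery would fail. A minimal standalone sketch of the hazard (hypothetical function, not the worker's real signature):

```python
def launch_bad(model_uid, n_gpu=1, **kwargs):
    temp = model_uid.upper()  # incidental local created before the snapshot
    launch_args = locals()    # snapshot now contains "temp" as well
    launch_args.pop("kwargs")
    launch_args.update(kwargs)
    return launch_args

def launch_good(model_uid, n_gpu=1, **kwargs):
    launch_args = locals()    # only the parameters exist in the frame here
    launch_args.pop("kwargs")
    launch_args.update(kwargs)
    temp = model_uid.upper()  # later locals no longer pollute the snapshot
    return launch_args

assert "temp" in launch_bad("m")  # replaying launch_bad(**args) would crash
assert launch_good("m", n_gpu=2) == {"model_uid": "m", "n_gpu": 2}
```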
xinference/deploy/cmdline.py
CHANGED
@@ -736,11 +736,15 @@ def model_launch(
         else []
     )
 
-    peft_model_config = …
-
-
-
-
+    peft_model_config = (
+        {
+            "image_lora_load_kwargs": image_lora_load_params,
+            "image_lora_fuse_kwargs": image_lora_fuse_params,
+            "lora_list": lora_list,
+        }
+        if lora_list or image_lora_load_params or image_lora_fuse_params
+        else None
+    )
 
     _gpu_idx: Optional[List[int]] = (
         None if gpu_idx is None else [int(idx) for idx in gpu_idx.split(",")]
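The rewritten block builds the PEFT config only when at least one LoRA-related option was actually supplied on the command line, forwarding `None` otherwise. The conditional-expression pattern in isolation (option values below are made up; in cmdline.py they come from click options):

```python
# Hypothetical option values standing in for the click-parsed CLI arguments.
lora_list: list = []
image_lora_load_params = None
image_lora_fuse_params = None

peft_model_config = (
    {
        "image_lora_load_kwargs": image_lora_load_params,
        "image_lora_fuse_kwargs": image_lora_fuse_params,
        "lora_list": lora_list,
    }
    if lora_list or image_lora_load_params or image_lora_fuse_params
    else None
)
assert peft_model_config is None  # nothing supplied -> nothing forwarded
```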
xinference/model/audio/model_spec.json
CHANGED
@@ -75,5 +75,12 @@
         "model_id": "BELLE-2/Belle-whisper-large-v2-zh",
         "model_revision": "ec5bd5d78598545b7585814edde86dac2002b5b9",
         "multilingual": false
+    },
+    {
+        "model_name": "Belle-whisper-large-v3-zh",
+        "model_family": "whisper",
+        "model_id": "BELLE-2/Belle-whisper-large-v3-zh",
+        "model_revision": "3bebc7247696b39f5ab9ed22db426943ac33f600",
+        "multilingual": false
     }
-]
+]
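With the spec registered, Belle-whisper-large-v3-zh becomes launchable as a built-in audio model. A hedged usage sketch via the xinference Python client (the endpoint, file name, and exact handle methods are assumptions, not taken from this diff):

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # hypothetical local endpoint
uid = client.launch_model(
    model_name="Belle-whisper-large-v3-zh", model_type="audio"
)
model = client.get_model(uid)

# The spec marks the model "multilingual": false, i.e. a Chinese-only
# Whisper fine-tune, so feed it Chinese speech.
with open("sample_zh.wav", "rb") as f:
    print(model.transcriptions(f.read()))
```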
xinference/model/embedding/core.py
CHANGED
@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import logging
+import os
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple, Union, no_type_check
 
 import numpy as np
 
+from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import get_cache_dir, is_model_cached
@@ -28,6 +31,10 @@ logger = logging.getLogger(__name__)
 # Init when registering all the builtin models.
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 EMBEDDING_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+EMBEDDING_EMPTY_CACHE_COUNT = int(
+    os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10")
+)
+assert EMBEDDING_EMPTY_CACHE_COUNT > 0
 
 
 def get_embedding_model_descriptions():
@@ -116,6 +123,7 @@ class EmbeddingModel:
         self._model_path = model_path
         self._device = device
         self._model = None
+        self._counter = 0
 
     def load(self):
         try:
@@ -134,6 +142,11 @@ class EmbeddingModel:
         self._model = SentenceTransformer(self._model_path, device=self._device)
 
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        self._counter += 1
+        if self._counter % EMBEDDING_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty embedding cache.")
+            gc.collect()
+            empty_cache()
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
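The pattern is a simple modulo counter: every N-th `create_embedding` call runs a full garbage-collection pass and asks the accelerator backend to release cached allocator blocks, with N configurable through `XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT`. A standalone sketch of the same idea (`flush_device_cache` below is a stand-in for xinference's `device_utils.empty_cache`, which dispatches per device type):

```python
import gc
import os

EMPTY_CACHE_COUNT = int(os.getenv("XINFERENCE_EMBEDDING_EMPTY_CACHE_COUNT", "10"))

def flush_device_cache() -> None:
    # Stand-in for device_utils.empty_cache(); on CUDA it boils down to
    # torch.cuda.empty_cache().
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except ImportError:
        pass

class PeriodicFlusher:
    """Run a GC pass plus a device-cache flush on every N-th call."""

    def __init__(self, every: int = EMPTY_CACHE_COUNT):
        assert every > 0
        self._every = every
        self._counter = 0

    def tick(self) -> None:
        self._counter += 1
        if self._counter % self._every == 0:
            gc.collect()          # drop Python-level garbage first
            flush_device_cache()  # then return cached GPU blocks to the driver
```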
xinference/model/llm/llm_family.json
CHANGED
@@ -1220,6 +1220,148 @@
             }
         ]
     },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-8B"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "Q2_K",
+                    "Q3_K_L",
+                    "Q3_K_M",
+                    "Q3_K_S",
+                    "Q4_0",
+                    "Q4_1",
+                    "Q4_K_M",
+                    "Q4_K_S",
+                    "Q5_0",
+                    "Q5_1",
+                    "Q5_K_M",
+                    "Q5_K_S",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "QuantFactory/Meta-Llama-3-8B-GGUF",
+                "model_file_name_template": "Meta-Llama-3-8B.{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-70B"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "Q4_K_M",
+                    "Q5_K_M"
+                ],
+                "model_id": "NousResearch/Meta-Llama-3-70B-GGUF",
+                "model_file_name_template": "Meta-Llama-3-70B-{quantization}.gguf"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+        "model_specs": [
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "IQ3_M",
+                    "Q4_K_M",
+                    "Q5_K_M",
+                    "Q6_K",
+                    "Q8_0"
+                ],
+                "model_id": "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
+                "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-8B-Instruct"
+            },
+            {
+                "model_format": "ggufv2",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "IQ1_M",
+                    "IQ2_XS",
+                    "Q4_K_M"
+                ],
+                "model_id": "lmstudio-community/Meta-Llama-3-70B-Instruct-GGUF",
+                "model_file_name_template": "Meta-Llama-3-8B-Instruct-{quantization}.gguf"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "meta-llama/Meta-Llama-3-70B-Instruct"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "LLAMA3",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n\n",
+            "inter_message_sep": "<|eot_id|>",
+            "stop_token_ids": [
+                128001,
+                128009
+            ],
+            "stop": [
+                "<|end_of_text|>",
+                "<|eot_id|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
@@ -1932,7 +2074,7 @@
     },
     {
         "version": 1,
-        "context_length": …,
+        "context_length": 32768,
         "model_name": "codeqwen1.5-chat",
         "model_lang": [
             "en",
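With these entries registered, Llama 3 can be launched by name like any other built-in model. A hedged sketch using the xinference Python client (the endpoint and the choice of format and quantization are illustrative):

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # hypothetical local endpoint
uid = client.launch_model(
    model_name="llama-3-instruct",
    model_size_in_billions=8,
    model_format="ggufv2",
    quantization="Q4_K_M",
)
model = client.get_model(uid)
# The LLAMA3 prompt style above registers <|eot_id|> and <|end_of_text|> as
# stop sequences, so chat responses terminate at the proper token.
print(model.chat("Write one sentence about GPUs."))
```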
xinference/model/llm/llm_family_modelscope.json
CHANGED
@@ -84,6 +84,96 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Llama 3 is an auto-regressive language model that uses an optimized transformer architecture",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-8B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-70B",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 8192,
+        "model_name": "llama-3-instruct",
+        "model_lang": [
+            "en"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks..",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-8B-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 70,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "LLM-Research/Meta-Llama-3-70B-Instruct",
+                "model_hub": "modelscope"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "LLAMA3",
+            "system_prompt": "You are a helpful assistant.",
+            "roles": [
+                "user",
+                "assistant"
+            ],
+            "intra_message_sep": "\n\n",
+            "inter_message_sep": "<|eot_id|>",
+            "stop_token_ids": [
+                128001,
+                128009
+            ],
+            "stop": [
+                "<|end_of_text|>",
+                "<|eot_id|>"
+            ]
+        }
+    },
     {
         "version": 1,
         "context_length": 2048,
@@ -2177,7 +2267,7 @@
     },
     {
         "version": 1,
-        "context_length": …,
+        "context_length": 32768,
         "model_name": "codeqwen1.5-chat",
         "model_lang": [
             "en",
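These entries mirror the Hugging Face ones but pull from the `LLM-Research` ModelScope repositories (`"model_hub": "modelscope"`), which is what users behind restricted networks need. As far as I know, xinference selects the download hub via the `XINFERENCE_MODEL_SRC` environment variable; treat the exact semantics as an assumption:

```python
import os

# Must be set before the xinference supervisor/worker starts so llama-3
# downloads resolve to the LLM-Research/Meta-Llama-3-* repos listed above.
os.environ["XINFERENCE_MODEL_SRC"] = "modelscope"
```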
xinference/model/llm/utils.py
CHANGED
@@ -114,6 +114,22 @@ class ChatModelMixin:
                 else:
                     ret += role
             return ret
+        elif prompt_style.style_name == "LLAMA3":
+            ret = (
+                f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
+                f"{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+            )
+            for i, message in enumerate(chat_history):
+                role = get_role(message["role"])
+                content = message["content"]
+                if content:
+                    ret += (
+                        f"<|start_header_id|>{role}<|end_header_id|>"
+                        f"{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+                    )
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>{prompt_style.intra_message_sep}"
+            return ret
         elif prompt_style.style_name == "FALCON":
             ret = prompt_style.system_prompt
             for message in chat_history:
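For a concrete picture of the template, here is the string the LLAMA3 branch produces for a single user turn followed by the customary empty assistant message, with the separators registered in the prompt_style above (hand-expanded for illustration, not a call into xinference):

```python
system_prompt = "You are a helpful assistant."
expected = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
    f"\n\n{system_prompt}<|eot_id|>"                            # system block
    "<|start_header_id|>user<|end_header_id|>\n\nHi!<|eot_id|>"  # user turn
    "<|start_header_id|>assistant<|end_header_id|>\n\n"          # empty content: cue to generate
)
print(expected)
```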
xinference/model/llm/vllm/core.py
CHANGED
@@ -85,6 +85,7 @@ except ImportError:
 
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
+    "llama-3",
     "baichuan",
     "internlm-16k",
     "mistral-v0.1",
@@ -94,6 +95,7 @@ VLLM_SUPPORTED_MODELS = [
 ]
 VLLM_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
+    "llama-3-instruct",
     "vicuna-v1.3",
     "vicuna-v1.5",
     "baichuan-chat",
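The two list entries are all it takes for Llama 3 to be routed to vLLM when it is available: engine selection is, at its core, a membership test against these allowlists. A simplified sketch of that gating (hypothetical helper, not xinference's actual matching code):

```python
VLLM_SUPPORTED_CHAT_MODELS = ["llama-2-chat", "llama-3-instruct"]

def should_use_vllm(model_name: str, model_format: str, has_cuda: bool) -> bool:
    # vLLM only serves unquantized pytorch weights on CUDA devices.
    return (
        has_cuda
        and model_format == "pytorch"
        and model_name in VLLM_SUPPORTED_CHAT_MODELS
    )

assert should_use_vllm("llama-3-instruct", "pytorch", has_cuda=True)
assert not should_use_vllm("llama-3-instruct", "ggufv2", has_cuda=True)
```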
xinference/model/rerank/core.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import logging
 import os
 import uuid
@@ -21,6 +22,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 
 from ...constants import XINFERENCE_CACHE_DIR
+from ...device_utils import empty_cache
 from ...types import Document, DocumentObj, Rerank
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import is_model_cached
@@ -31,6 +33,8 @@ logger = logging.getLogger(__name__)
 # Init when registering all the builtin models.
 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)
 RERANK_MODEL_DESCRIPTIONS: Dict[str, List[Dict]] = defaultdict(list)
+RERANK_EMPTY_CACHE_COUNT = int(os.getenv("XINFERENCE_RERANK_EMPTY_CACHE_COUNT", "10"))
+assert RERANK_EMPTY_CACHE_COUNT > 0
 
 
 def get_rerank_model_descriptions():
@@ -113,28 +117,44 @@ class RerankModel:
         self._model_config = model_config or dict()
         self._use_fp16 = use_fp16
         self._model = None
+        self._counter = 0
 
     def load(self):
-        …
-        from …
-        …
-        )
-        …
+        if self._model_spec.type == "normal":
+            try:
+                from sentence_transformers.cross_encoder import CrossEncoder
+            except ImportError:
+                error_message = "Failed to import module 'sentence-transformers'"
+                installation_guide = [
+                    "Please make sure 'sentence-transformers' is installed. ",
+                    "You can install it by `pip install sentence-transformers`\n",
+                ]
+
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            self._model = CrossEncoder(
+                self._model_path, device=self._device, **self._model_config
+            )
+            if self._use_fp16:
+                self._model.model.half()
+        else:
+            try:
+                if self._model_spec.type == "LLM-based":
+                    from FlagEmbedding import FlagLLMReranker as FlagReranker
+                elif self._model_spec.type == "LLM-based layerwise":
+                    from FlagEmbedding import LayerWiseFlagLLMReranker as FlagReranker
+                else:
+                    raise RuntimeError(
+                        f"Unsupported Rank model type: {self._model_spec.type}"
+                    )
+            except ImportError:
+                error_message = "Failed to import module 'FlagEmbedding'"
+                installation_guide = [
+                    "Please make sure 'FlagEmbedding' is installed. ",
+                    "You can install it by `pip install FlagEmbedding`\n",
+                ]
 
-        …
-        …
+                raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+            self._model = FlagReranker(self._model_path, use_fp16=self._use_fp16)
 
     def rerank(
         self,
@@ -145,13 +165,21 @@ class RerankModel:
         return_documents: Optional[bool],
         **kwargs,
     ) -> Rerank:
+        self._counter += 1
+        if self._counter % RERANK_EMPTY_CACHE_COUNT == 0:
+            logger.debug("Empty rerank cache.")
+            gc.collect()
+            empty_cache()
         assert self._model is not None
         if kwargs:
             raise ValueError("rerank hasn't support extra parameter.")
        if max_chunks_per_doc is not None:
             raise ValueError("rerank hasn't support `max_chunks_per_doc` parameter.")
         sentence_combinations = [[query, doc] for doc in documents]
-        similarity_scores = self._model.predict(sentence_combinations)
+        if self._model_spec.type == "normal":
+            similarity_scores = self._model.predict(sentence_combinations)
+        else:
+            similarity_scores = self._model.compute_score(sentence_combinations)
         sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
         if top_n is not None:
             sim_scores_argsort = sim_scores_argsort[:top_n]
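The loader now dispatches on the spec's `type` field: `"normal"` keeps the sentence-transformers `CrossEncoder` path (scored with `predict`), while the LLM-based variants go through FlagEmbedding rerankers (scored with `compute_score`). Client usage is unchanged either way; a hedged sketch (the model name, endpoint, and handle methods are assumptions):

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # hypothetical local endpoint
uid = client.launch_model(model_name="bge-reranker-base", model_type="rerank")
model = client.get_model(uid)

result = model.rerank(
    documents=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    query="What is the capital of France?",
    top_n=1,
)
print(result["results"][0])  # best-scoring document first
```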
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 0.10.2.post1
+Version: 0.10.3
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye
@@ -176,13 +176,14 @@ potential of cutting-edge AI models.
 - Docker image: [#855](https://github.com/xorbitsai/inference/pull/855)
 - Support multimodal: [#829](https://github.com/xorbitsai/inference/pull/829)
 ### New Models
+- Built-in support for [Llama 3](https://github.com/meta-llama/llama3): [#1332](https://github.com/xorbitsai/inference/pull/1332)
+- Built-in support for [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01): [#1310](https://github.com/xorbitsai/inference/pull/1310)
 - Built-in support for [Qwen1.5 MOE](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat): [#1263](https://github.com/xorbitsai/inference/pull/1263)
 - Built-in support for [Qwen1.5 32B](https://huggingface.co/Qwen/Qwen1.5-32B-Chat): [#1249](https://github.com/xorbitsai/inference/pull/1249)
 - Built-in support for [OmniLMM](https://github.com/OpenBMB/OmniLMM): [#1171](https://github.com/xorbitsai/inference/pull/1171)
 - Built-in support for [Gemma](https://github.com/google-deepmind/gemma): [#1024](https://github.com/xorbitsai/inference/pull/1024)
-- Built-in support for [Qwen1.5](https://github.com/QwenLM/Qwen1.5): [#994](https://github.com/xorbitsai/inference/pull/994)
-- Built-in support for [Yi-VL](https://github.com/01-ai/Yi): [#946](https://github.com/xorbitsai/inference/pull/946)
 ### Integrations
+- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
 - [Chatbox](https://chatboxai.app/): a desktop client for multiple cutting-edge LLM models, available on Windows, Mac and Linux.
 
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 xinference/__init__.py,sha256=0LgIveLP6CXxoIaSrxhlFyOh0lOqPgJBVcBe0tkWJjc,987
 xinference/_compat.py,sha256=SQAjZMGxtBIce45qtW7ob7RWzA0zhv2yB3AxT0rb0uU,1778
-xinference/_version.py,sha256=…
+xinference/_version.py,sha256=AQ6rrRceWHquLfKWGWzSVXI8bGhcAlO5_Q3_EWaZt1Q,498
 xinference/conftest.py,sha256=RffV9htxwo6iDEGZwmcj0A_O_XBQM2RRUea4q6XTeGQ,9742
 xinference/constants.py,sha256=Bu_fOJUGAvvqF_6FY5OzOHl7fQ1Nomek3LY17xr9oz4,2882
 xinference/device_utils.py,sha256=WNKDD4Eni3Io3AehiyonsuoJaukT77Bc76Es7vNGvjc,2615
@@ -32,9 +32,9 @@ xinference/core/resource.py,sha256=FQ0aRt3T4ZQo0P6CZZf5QUKHiCsr5llBvKb1f7wfnxg,1
 xinference/core/status_guard.py,sha256=ScmTFb3NPTp-RzufdHFpBh5TZHPc2bu907JA8l0gywE,2804
 xinference/core/supervisor.py,sha256=salJ3vIjkQblexxLYl7Mi46iiWIKhpsY9W8DRXxoHrA,41212
 xinference/core/utils.py,sha256=tUpUJUQv1zkE9i7fw1pAFfFdcB3PC6DvKJn4Bmmq75E,6008
-xinference/core/worker.py,sha256=…
+xinference/core/worker.py,sha256=zfbxO3EJl3zJ7JKhXLEQ7EK3sd9yXSW8iUsn1dq5e00,33784
 xinference/deploy/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/deploy/cmdline.py,sha256=…
+xinference/deploy/cmdline.py,sha256=bpc6g8V6FwVFx-DOGU7n8XRPSZFrXqFRmRH6atD98DE,35647
 xinference/deploy/local.py,sha256=vlAvhcl8utP1DjW4MJpBgD4JLHQV-1Xebmdd8j9M8IM,3946
 xinference/deploy/supervisor.py,sha256=fMHeEGigQ72PD9JEFmZ5Xudn25Uj4DhD2OVIlAu_YpA,2978
 xinference/deploy/utils.py,sha256=_g4U6GJVzHnEHzF-KSMm-tffba2mtLNnxoEwnC8jmj8,5361
@@ -50,11 +50,11 @@ xinference/model/utils.py,sha256=qqCaje-dJvSarVzeGgmwKnq85e82JCLPVq2yCfAFZlo,145
 xinference/model/audio/__init__.py,sha256=0EVzX6b4pcOO63NAcNpYWTVYVa7w7yG5cPpGxOY9MXw,2347
 xinference/model/audio/core.py,sha256=ypbIvbueTFKeulYt7aJX7FfU4y3Hn3DzxkhhjKO6Dxw,4373
 xinference/model/audio/custom.py,sha256=Li6VpTmpZ17YXk_bwN2-tUKRAJwNcW-O4OwrJefzC2o,4966
-xinference/model/audio/model_spec.json,sha256=…
+xinference/model/audio/model_spec.json,sha256=dQUgG7HT9Ge4-0TBie7GcyXbPHz4lH_2HttVTm560Dg,2595
 xinference/model/audio/utils.py,sha256=pwo5cHh8nvhyBa9f-17QaVpXMSjmbpGbPYKwBBtEhGM,717
 xinference/model/audio/whisper.py,sha256=vWUn5huqER_g8ttxzHFNz6UNyDn2CnF7OzS_4PQjjKE,4599
 xinference/model/embedding/__init__.py,sha256=0FLzOZyOuMctxFvhobkLXRUepwHck6RPbtjCct1eMI8,2854
-xinference/model/embedding/core.py,sha256=…
+xinference/model/embedding/core.py,sha256=VJ1b7zUwkm5VtmtQx3-bYpJuETiKb4345dYP6P4oRM4,13023
 xinference/model/embedding/custom.py,sha256=iE3-iWVzxarXdeTdw5e6rxv6HQRXVbPHp65wwhT2IL8,3919
 xinference/model/embedding/model_spec.json,sha256=hpM2_FhH6gSqmrgu2MMu4u94XMEw6r9A6aKUQObsCK0,6652
 xinference/model/embedding/model_spec_modelscope.json,sha256=No71OUu5OoALs6amJ0UiRU6JH9DkYRQvdvSgCf3IIHs,5814
@@ -68,10 +68,10 @@ xinference/model/image/stable_diffusion/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17k
 xinference/model/image/stable_diffusion/core.py,sha256=ib_ZeSg7hzynmRqSnhjtrVuhoLOgZPrR1ZH2LjBmH2E,6063
 xinference/model/llm/__init__.py,sha256=op1aUvEPtQ5KeWYvbP-skptyMC8osQphWKs7EbgNJ1c,6555
 xinference/model/llm/core.py,sha256=FeZv1UiA7zPdmDcAQpmFL9Bslj6grqOSRvqsqkVtBHg,9572
-xinference/model/llm/llm_family.json,sha256=…
+xinference/model/llm/llm_family.json,sha256=Few9frWihmqwN_c_Q0B5S1esZ8DPhGdnNRvxAGEQIOE,127493
 xinference/model/llm/llm_family.py,sha256=pryVjq7WZ84x9kwzXQgXFgE5UxIqBn_LTudeXnDX5RE,34615
-xinference/model/llm/llm_family_modelscope.json,sha256=…
-xinference/model/llm/utils.py,sha256=…
+xinference/model/llm/llm_family_modelscope.json,sha256=PJlaTLjcYdaqR95U8GYaSJsbxxCZ_Q-8k6Di4ciGZ_k,79795
+xinference/model/llm/utils.py,sha256=gNuRa1VIk5Dv0rrkuCCNQJCFQ7iqwEKIjiej4Cfo8eY,29706
 xinference/model/llm/ggml/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/llm/ggml/chatglm.py,sha256=Zrzw8K2EroI5v2JlwOAJ08tNFs871n86zRtBxuK97Z8,13044
 xinference/model/llm/ggml/llamacpp.py,sha256=HLjcMOOrMoriaTx39jDOufyfY5lXdO84cCWZORjCc8U,11426
@@ -97,9 +97,9 @@ xinference/model/llm/pytorch/yi_vl.py,sha256=aZkMQPlIb522Ue1K62DAMclq1n9HVw4OQNu
 xinference/model/llm/sglang/__init__.py,sha256=-sjSIQ4K6w-TEzx49kVaWeWC443fnZqODU91GCQ_JNo,581
 xinference/model/llm/sglang/core.py,sha256=eqAczZfGJInC_jihXVeKiWQ79Llk3reHDBkdShQlH-0,12915
 xinference/model/llm/vllm/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/model/llm/vllm/core.py,sha256=…
+xinference/model/llm/vllm/core.py,sha256=sV67VKfViYzX_IziSYKlwzO1rw7OUyZEJSCOnxRQSKY,18078
 xinference/model/rerank/__init__.py,sha256=BXIL1uu3ZpZHX9bODhW9lxKUXudZE7-OkXFmmM5rpMU,2817
-xinference/model/rerank/core.py,sha256=…
+xinference/model/rerank/core.py,sha256=UVfue73hHE9UL5c-X7OajZfTR_mLTv673RLFWZAfWV4,9665
 xinference/model/rerank/custom.py,sha256=NKk7jA7p4xkuwS5WoOs2SY2wdnoAVpyCjBTvv317bBw,3917
 xinference/model/rerank/model_spec.json,sha256=LCiiCdNz4NYt9vKVnHffk3ZpwvgzzHxe4zsaxOqxL18,1367
 xinference/model/rerank/model_spec_modelscope.json,sha256=vSSC0aWy_DHnNDzzBcMWr2pqdISDmPS95FtD_YfMmn4,1275
@@ -15400,9 +15400,9 @@ xinference/web/ui/node_modules/yargs-parser/package.json,sha256=BSwbOzgetKXMK4u0
 xinference/web/ui/node_modules/yocto-queue/package.json,sha256=6U1XHQPGXJTqsiFvT953ORihUtXTblZy4fXBWP9qxC0,725
 xinference/web/ui/node_modules/yup/package.json,sha256=xRFSROB9NKxqSWHEVFvSTsPs9Ll074uo8OS1zEw0qhA,1206
 xinference/web/ui/node_modules/yup/node_modules/type-fest/package.json,sha256=JTv2zTTVgxQ2H82m1-6qEpdMv08lHjFx4Puf_MsbB_Q,1134
-xinference-0.10.…
-xinference-0.10.…
-xinference-0.10.…
-xinference-0.10.…
-xinference-0.10.…
-xinference-0.10.…
+xinference-0.10.3.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+xinference-0.10.3.dist-info/METADATA,sha256=CjQ70PUW3asgEheRVmH7_P6AZMUglcWaUkIo1VHvcz8,15256
+xinference-0.10.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+xinference-0.10.3.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
+xinference-0.10.3.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
+xinference-0.10.3.dist-info/RECORD,,
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/LICENSE
File without changes
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/WHEEL
File without changes
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/entry_points.txt
File without changes
{xinference-0.10.2.post1.dist-info → xinference-0.10.3.dist-info}/top_level.txt
File without changes