xinference 0.15.2__py3-none-any.whl → 0.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/model/embedding/core.py +14 -5
- xinference/model/embedding/model_spec.json +7 -0
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/image/stable_diffusion/core.py +12 -0
- xinference/model/llm/llm_family.json +12 -24
- xinference/model/llm/llm_family_modelscope.json +2 -10
- xinference/model/llm/utils.py +14 -3
- xinference/model/llm/vllm/core.py +22 -6
- xinference/model/llm/vllm/utils.py +42 -0
- xinference/model/rerank/core.py +19 -0
- xinference/model/rerank/model_spec.json +8 -0
- xinference/model/rerank/model_spec_modelscope.json +8 -0
- xinference/model/utils.py +0 -25
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.29578905.js → main.e51a356d.js} +3 -3
- xinference/web/ui/build/static/js/main.e51a356d.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +1 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/METADATA +4 -3
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/RECORD +26 -25
- xinference/web/ui/build/static/js/main.29578905.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +0 -1
- /xinference/web/ui/build/static/js/{main.29578905.js.LICENSE.txt → main.e51a356d.js.LICENSE.txt} +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/LICENSE +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/WHEEL +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
-    "date": "2024-09-
+    "date": "2024-09-30T20:17:26+0800",
     "dirty": false,
     "error": null,
-    "full-revisionid": "
-    "version": "0.15.2"
+    "full-revisionid": "00a9ee15279a60a6d75393c4720d8da5cbbf5796",
+    "version": "0.15.3"
 }
 ''' # END VERSION_JSON

xinference/model/embedding/core.py
CHANGED

@@ -141,7 +141,15 @@ class EmbeddingModel:

     def load(self):
         try:
+            import sentence_transformers
             from sentence_transformers import SentenceTransformer
+
+            if sentence_transformers.__version__ < "3.1.0":
+                raise ValueError(
+                    "The sentence_transformers version must be greater than 3.1.0. "
+                    "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                    "https://github.com/UKPLab/sentence-transformers"
+                )
         except ImportError:
             error_message = "Failed to import module 'SentenceTransformer'"
             installation_guide = [
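Note: the gate above compares version strings lexicographically, so a hypothetical "3.10.0" would sort before "3.2.0" and wrongly trip the check (the vLLM version gates further down use the same pattern). A more robust comparison is sketched below with the packaging library; its availability is an assumption here, though it is typically present wherever transformers is installed:

from packaging import version

import sentence_transformers

# Semantic version comparison instead of a lexicographic string comparison
if version.parse(sentence_transformers.__version__) < version.parse("3.1.0"):
    raise ValueError("sentence-transformers >= 3.1.0 is required")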
@@ -173,9 +181,6 @@ class EmbeddingModel:
             )
             torch_dtype = torch.float32

-        from ..utils import patch_trust_remote_code
-
-        patch_trust_remote_code()
         if (
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
@@ -191,7 +196,10 @@ class EmbeddingModel:
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
         self._model = SentenceTransformer(
-            self._model_path,
+            self._model_path,
+            device=self._device,
+            model_kwargs=model_kwargs,
+            trust_remote_code=True,
         )

     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
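Passing trust_remote_code=True at each load replaces the global patch_trust_remote_code monkey-patch that this release deletes from xinference/model/utils.py (see below). A standalone sketch of the equivalent call; the path, device and dtype are placeholders, not values from this diff:

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "/path/to/downloaded/model",                  # placeholder local path
    device="cuda",                                # whatever self._device resolved to
    model_kwargs={"torch_dtype": torch.float16},  # e.g. fp16 on GPU
    trust_remote_code=True,                       # scoped opt-in instead of a global patch
)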
@@ -213,6 +221,7 @@ class EmbeddingModel:
         convert_to_tensor: bool = False,
         device: str = None,
         normalize_embeddings: bool = False,
+        **kwargs,
     ):
         """
         Computes sentence embeddings
@@ -317,7 +326,7 @@ class EmbeddingModel:
                     all_token_nums += features["attention_mask"].sum().item()

                 with torch.no_grad():
-                    out_features = model.forward(features)
+                    out_features = model.forward(features, **kwargs)

                 if output_value == "token_embeddings":
                     embeddings = []
xinference/model/embedding/model_spec.json
CHANGED

@@ -238,5 +238,12 @@
     "language": ["zh", "en"],
     "model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
     "model_revision": "e26182b2122f4435e8b3ebecbf363990f409b45b"
+  },
+  {
+    "model_name": "jina-embeddings-v3",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["zh", "en"],
+    "model_id": "jinaai/jina-embeddings-v3"
   }
 ]
xinference/model/embedding/model_spec_modelscope.json
CHANGED

@@ -233,12 +233,20 @@
     "model_id": "AI-ModelScope/m3e-large",
     "model_hub": "modelscope"
   },
-  {
+  {
     "model_name": "gte-Qwen2",
     "dimensions": 4096,
     "max_tokens": 32000,
     "language": ["zh", "en"],
     "model_id": "iic/gte_Qwen2-7B-instruct",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "jina-embeddings-v3",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["zh", "en"],
+    "model_id": "jinaai/jina-embeddings-v3",
+    "model_hub": "modelscope"
   }
 ]
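With these spec entries in place, the new embedding model can be launched like any other. A hedged example using the xinference client API (the server address is a placeholder):

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder address
uid = client.launch_model(model_name="jina-embeddings-v3", model_type="embedding")
model = client.get_model(uid)
print(model.create_embedding("hello")["data"][0]["embedding"][:8])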
xinference/model/image/stable_diffusion/core.py
CHANGED

@@ -193,6 +193,18 @@ class DiffusionModel(SDAPIDiffusionModelMixin):
             self._model_path,
             **self._kwargs,
         )
+        if self._kwargs.get("deepcache", True):
+            # NOTE: DeepCache should be loaded first before cpu_offloading
+            try:
+                from DeepCache import DeepCacheSDHelper
+
+                helper = DeepCacheSDHelper(pipe=self._model)
+                helper.set_params(cache_interval=3, cache_branch_id=0)
+                helper.enable()
+            except ImportError:
+                logger.debug("deepcache is not installed")
+                pass
+
         if self._kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
             self._model.enable_model_cpu_offload()
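Because the deepcache kwarg defaults to True in the branch above, DeepCache is enabled whenever the DeepCache package is installed. Turning it off per launch is presumably a matter of forwarding the kwarg through launch_model; a sketch reusing the client from the embedding example (the model name is illustrative):

uid = client.launch_model(
    model_name="sd3-medium",  # hypothetical image model name
    model_type="image",
    deepcache=False,          # skip the DeepCacheSDHelper branch
)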
xinference/model/llm/llm_family.json
CHANGED

@@ -6483,8 +6483,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-1B",
-      "model_revision": "a9fc14aea824b6ea1d44f8778cad6b35512c4ce1"
+      "model_id": "OpenGVLab/InternVL2-1B"
     },
     {
       "model_format": "pytorch",

@@ -6494,8 +6493,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B",
-      "model_revision": "422ad7c6335917bfb514958233955512338485a6"
+      "model_id": "OpenGVLab/InternVL2-2B"
     },
     {
       "model_format": "awq",

@@ -6503,8 +6501,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-2B-AWQ",
-      "model_revision": "701bc3fc098a8a3b686b3b4135cfb77202be89e0"
+      "model_id": "OpenGVLab/InternVL2-2B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6514,8 +6511,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-4B",
-      "model_revision": "b50544dafada6c41e80bfde2f57cc9b0140fc21c"
+      "model_id": "OpenGVLab/InternVL2-4B"
     },
     {
       "model_format": "pytorch",

@@ -6525,8 +6521,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B",
-      "model_revision": "3bfd3664dea4f3da628785f5125d30f889701253"
+      "model_id": "OpenGVLab/InternVL2-8B"
     },
     {
       "model_format": "awq",

@@ -6534,8 +6529,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-8B-AWQ",
-      "model_revision": "9f1a4756b7ae18eb26d8a22b618dfc283e8193b3"
+      "model_id": "OpenGVLab/InternVL2-8B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6545,8 +6539,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B",
-      "model_revision": "b9f3c7e6d575b0115e076a3ffc46fd20b7586899"
+      "model_id": "OpenGVLab/InternVL2-26B"
     },
     {
       "model_format": "awq",

@@ -6554,8 +6547,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-26B-AWQ",
-      "model_revision": "469e0019ffd251e22ff6501a5c2321964e86ef0d"
+      "model_id": "OpenGVLab/InternVL2-26B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6565,8 +6557,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B",
-      "model_revision": "725a12063bb855c966e30a0617d0ccd9e870d772"
+      "model_id": "OpenGVLab/InternVL2-40B"
     },
     {
       "model_format": "awq",

@@ -6574,8 +6565,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-40B-AWQ",
-      "model_revision": "d92e140f6dfe8ea9679924c6a31898f42c4e1846"
+      "model_id": "OpenGVLab/InternVL2-40B-AWQ"
     },
     {
       "model_format": "pytorch",

@@ -6585,8 +6575,7 @@
         "8-bit",
         "none"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B",
-      "model_revision": "cf7914905f78e9e3560ddbd6f5dfc39becac494f"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B"
     },
     {
       "model_format": "awq",

@@ -6594,8 +6583,7 @@
       "quantizations": [
         "Int4"
       ],
-      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
-      "model_revision": "1bc796bf80f2ebc7d6a14c15f55217a4600d50a4"
+      "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ"
     }
   ],
   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
xinference/model/llm/llm_family_modelscope.json
CHANGED

@@ -4334,16 +4334,8 @@
       }
     ],
     "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-    "stop_token_ids": [
-      151643,
-      151644,
-      151645
-    ],
-    "stop": [
-      "<|endoftext|>",
-      "<|im_start|>",
-      "<|im_end|>"
-    ]
+    "stop_token_ids": [],
+    "stop": []
   },
   {
     "version": 1,
xinference/model/llm/utils.py
CHANGED

@@ -159,14 +159,25 @@ class ChatModelMixin:
             for image_url in image_urls:
                 fut = executor.submit(_decode_image, image_url)
                 image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
+            images.extend([fut.result() for fut in image_futures])
             if len(image_futures) == 0:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
+                placeholders = "\n".join(
+                    f"Image-{i+1}: <image>\n"
+                    for i in range(
+                        len(images) - len(image_futures), len(images)
+                    )
+                )
                 ret += (
-                    role
+                    role
+                    + "\n"
+                    + f"{placeholders}\n{text}"
+                    + intra_message_sep
+                    + "\n"
                 )
-
+                if len(images) == 1:
+                    ret = ret.replace("Image-1: <image>\n", "<image>\n")
             return ret, images
         else:
            raise ValueError(f"Invalid model family: {model_family}")
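The range in the placeholder loop starts at len(images) - len(image_futures) so that images decoded in earlier turns keep their numbering. A small self-contained illustration of the resulting prompt fragment (the role and text are made up, and the real code also appends intra_message_sep):

images = ["img_a", "img_b"]   # decoded PIL images in the real code
image_futures = images        # pretend both were decoded in this turn
placeholders = "\n".join(
    f"Image-{i+1}: <image>\n"
    for i in range(len(images) - len(image_futures), len(images))
)
print("USER" + "\n" + f"{placeholders}\nWhat differs between them?")
# USER
# Image-1: <image>
#
# Image-2: <image>
#
# What differs between them?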
xinference/model/llm/vllm/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -47,6 +48,7 @@ from ..utils import (
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check

 logger = logging.getLogger(__name__)

@@ -65,6 +67,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]


 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +93,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -171,6 +172,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")

+if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+

 class VLLMModel(LLM):
     def __init__(
@@ -304,7 +308,12 @@ class VLLMModel(LLM):
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len",
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else None
+        )

         return model_config

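Since the sanitizer decodes the value with json.loads, limit_mm_per_prompt is expected to arrive as a JSON string. A hedged launch example (the model name and image count are illustrative):

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder address
uid = client.launch_model(
    model_name="internvl2",
    model_type="LLM",
    model_engine="vllm",
    limit_mm_per_prompt='{"image": 2}',   # decoded into {"image": 2}
)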
@@ -434,6 +443,7 @@ class VLLMModel(LLM):
             usage=usage,
         )

+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -665,6 +675,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             yield self._to_chat_completion_chunk(chunk)
             i += 1

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -741,13 +752,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         )
         return generate_config

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        # only support single image, waiting vllm support multi images
         model_family = self.model_family.model_family or self.model_family.model_name
         prompt, images = self.get_specific_prompt(model_family, messages)

@@ -755,11 +766,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             inputs = {
                 "prompt": prompt,
             }
-        else:
+        elif len(images) == 1:
             inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)

         stream = generate_config.get("stream", None)
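With the len(images) > 1 branch in place (and the old "only support single image" comment gone), a vision chat can now carry several images in one request. A sketch of the OpenAI-style payload, assuming the messages-based chat API of this release and reusing the client and uid from the previous sketch (URLs are placeholders):

model = client.get_model(uid)
model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two pictures."},
                {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
                {"type": "image_url", "image_url": {"url": "https://example.com/b.png"}},
            ],
        }
    ]
)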
xinference/model/llm/vllm/utils.py
ADDED

@@ -0,0 +1,42 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        logger.info("vllm_check")
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper
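vllm_check is a fail-fast guard for the async entry points decorated above: on AsyncEngineDeadError it stops the model and then calls os._exit(1), which skips normal interpreter cleanup (a dead engine could hang an orderly shutdown) and lets xinference auto-recover the replica. A minimal usage sketch with a stand-in class:

# assuming: from xinference.model.llm.vllm.utils import vllm_check
class FakeVLLMModel:
    def stop(self):
        print("releasing engine resources")

    @vllm_check
    async def async_generate(self, prompt: str):
        # would raise AsyncEngineDeadError here if the vLLM engine had died
        return f"completion for {prompt!r}"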
xinference/model/rerank/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import gc
+import importlib
 import logging
 import os
 import threading
@@ -178,9 +179,27 @@ class RerankModel:
         return rerank_type

     def load(self):
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if (
+            self._auto_detect_type(self._model_path) != "normal"
+            and flash_attn_installed
+        ):
+            logger.warning(
+                "flash_attn can only support fp16 and bf16, "
+                "will force set `use_fp16` to True"
+            )
+            self._use_fp16 = True
         if self._model_spec.type == "normal":
             try:
+                import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                if sentence_transformers.__version__ < "3.1.0":
+                    raise ValueError(
+                        "The sentence_transformers version must be greater than 3.1.0. "
+                        "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                        "https://github.com/UKPLab/sentence-transformers"
+                    )
             except ImportError:
                 error_message = "Failed to import module 'sentence-transformers'"
                 installation_guide = [
xinference/model/rerank/model_spec.json
CHANGED

@@ -54,5 +54,13 @@
     "max_tokens": 1024,
     "model_id": "jinaai/jina-reranker-v2-base-multilingual",
     "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "openbmb/MiniCPM-Reranker",
+    "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
   }
 ]
xinference/model/rerank/model_spec_modelscope.json
CHANGED

@@ -49,5 +49,13 @@
     "max_tokens": 2048,
     "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "OpenBMB/MiniCPM-Reranker",
+    "model_hub": "modelscope"
   }
 ]
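Note the differing hub capitalization of the new entry: openbmb/MiniCPM-Reranker on Hugging Face versus OpenBMB/MiniCPM-Reranker on ModelScope. Once registered, the model is exercised like any other reranker; a hedged sketch reusing the client from above:

uid = client.launch_model(model_name="minicpm-reranker", model_type="rerank")
reranker = client.get_model(uid)
print(
    reranker.rerank(
        documents=[
            "MiniCPM is a family of small language models.",
            "Paris is the capital of France.",
        ],
        query="Which passage talks about small language models?",
    )
)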
xinference/model/utils.py
CHANGED

@@ -300,31 +300,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir


-def patch_trust_remote_code():
-    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
-    models will fail to load, e.g. jina-embeddings-v2-base-en
-
-    :return:
-    """
-    try:
-        from transformers.dynamic_module_utils import resolve_trust_remote_code
-    except ImportError:
-        logger.error("Patch transformers trust_remote_code failed.")
-    else:
-
-        def _patched_resolve_trust_remote_code(*args, **kwargs):
-            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
-            return True
-
-        if (
-            resolve_trust_remote_code.__code__
-            != _patched_resolve_trust_remote_code.__code__
-        ):
-            resolve_trust_remote_code.__code__ = (
-                _patched_resolve_trust_remote_code.__code__
-            )
-
-
 def select_device(device):
     try:
         import torch  # noqa: F401
xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.29578905.js",
+    "main.js": "./static/js/main.e51a356d.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.29578905.js.map": "./static/js/main.29578905.js.map"
+    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.29578905.js"
+    "static/js/main.e51a356d.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.29578905.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>