xinference 0.15.1__py3-none-any.whl → 0.15.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as possibly problematic.
- xinference/_version.py +3 -3
- xinference/core/model.py +2 -2
- xinference/model/audio/cosyvoice.py +3 -3
- xinference/model/embedding/core.py +14 -5
- xinference/model/embedding/model_spec.json +7 -0
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/image/stable_diffusion/core.py +42 -19
- xinference/model/llm/__init__.py +1 -1
- xinference/model/llm/llm_family.json +862 -26
- xinference/model/llm/llm_family_modelscope.json +895 -10
- xinference/model/llm/sglang/core.py +4 -0
- xinference/model/llm/utils.py +14 -3
- xinference/model/llm/vllm/core.py +27 -6
- xinference/model/llm/vllm/utils.py +42 -0
- xinference/model/rerank/core.py +19 -0
- xinference/model/rerank/model_spec.json +8 -0
- xinference/model/rerank/model_spec_modelscope.json +8 -0
- xinference/model/utils.py +0 -25
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.754740c0.js → main.e51a356d.js} +3 -3
- xinference/web/ui/build/static/js/main.e51a356d.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c7bf40bab396765f67d0fed627ed3665890608b2d0edaa3e8cb7cfc96310db45.json +1 -0
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/METADATA +8 -7
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/RECORD +31 -30
- xinference/web/ui/build/static/js/main.754740c0.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +0 -1
- /xinference/web/ui/build/static/js/{main.754740c0.js.LICENSE.txt → main.e51a356d.js.LICENSE.txt} +0 -0
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/LICENSE +0 -0
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/WHEEL +0 -0
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.1.dist-info → xinference-0.15.3.dist-info}/top_level.txt +0 -0

xinference/model/llm/sglang/core.py
CHANGED

@@ -68,6 +68,8 @@ SGLANG_SUPPORTED_MODELS = [
     "llama-3.1",
     "mistral-v0.1",
     "mixtral-v0.1",
+    "qwen2.5",
+    "qwen2.5-coder",
 ]
 SGLANG_SUPPORTED_CHAT_MODELS = [
     "llama-2-chat",
@@ -85,6 +87,8 @@ SGLANG_SUPPORTED_CHAT_MODELS = [
     "deepseek-v2.5",
     "deepseek-v2-chat",
     "deepseek-v2-chat-0628",
+    "qwen2.5-instruct",
+    "qwen2.5-coder-instruct",
 ]

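With qwen2.5 / qwen2.5-coder (and their instruct variants) added to the SGLang allow-lists above, these models can be served on the SGLang engine. A hedged usage sketch, assuming a running endpoint at 127.0.0.1:9997 and the 0.15-series client API; the model size and endpoint URL are example values:

# Illustrative only: launch qwen2.5-instruct on the SGLang engine and send one chat turn.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",
    model_engine="sglang",
    model_size_in_billions=7,
)
model = client.get_model(model_uid)
print(model.chat(messages=[{"role": "user", "content": "Hello!"}]))
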
xinference/model/llm/utils.py
CHANGED

@@ -159,14 +159,25 @@ class ChatModelMixin:
             for image_url in image_urls:
                 fut = executor.submit(_decode_image, image_url)
                 image_futures.append(fut)
-            images
+            images.extend([fut.result() for fut in image_futures])
             if len(image_futures) == 0:
                 ret += role + "\n" + text + intra_message_sep + "\n"
             else:
+                placeholders = "\n".join(
+                    f"Image-{i+1}: <image>\n"
+                    for i in range(
+                        len(images) - len(image_futures), len(images)
+                    )
+                )
                 ret += (
-                    role
+                    role
+                    + "\n"
+                    + f"{placeholders}\n{text}"
+                    + intra_message_sep
+                    + "\n"
                 )
-
+                if len(images) == 1:
+                    ret = ret.replace("Image-1: <image>\n", "<image>\n")
         return ret, images
     else:
         raise ValueError(f"Invalid model family: {model_family}")

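The new branch numbers each image of the current message as "Image-N: <image>" and collapses the prefix when only one image is present overall. A standalone sketch of that numbering with stand-in values (the role/intra_message_sep strings here are simplifications, not the real prompt style):

role, intra_message_sep = "user", ""
text = "Describe the two pictures."

images = []                          # images accumulated from earlier messages
image_results = ["img_a", "img_b"]   # stand-ins for the decoded images of this message
images.extend(image_results)

placeholders = "\n".join(
    f"Image-{i+1}: <image>\n"
    for i in range(len(images) - len(image_results), len(images))
)
ret = role + "\n" + f"{placeholders}\n{text}" + intra_message_sep + "\n"
if len(images) == 1:
    ret = ret.replace("Image-1: <image>\n", "<image>\n")

print(ret)  # "Image-1: <image>" and "Image-2: <image>" precede the user text
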

xinference/model/llm/vllm/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -47,6 +48,7 @@ from ..utils import (
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check

 logger = logging.getLogger(__name__)

@@ -65,6 +67,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]


 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +93,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -138,6 +139,11 @@ if VLLM_INSTALLED and vllm.__version__ >= "0.3.0":
     VLLM_SUPPORTED_MODELS.append("codeqwen1.5")
     VLLM_SUPPORTED_CHAT_MODELS.append("codeqwen1.5-chat")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-instruct")
+    VLLM_SUPPORTED_MODELS.append("qwen2.5")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct")
+    VLLM_SUPPORTED_MODELS.append("qwen2.5-coder")
+    VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-coder-instruct")
+

 if VLLM_INSTALLED and vllm.__version__ >= "0.3.2":
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-it")
@@ -166,6 +172,9 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")

+if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+

 class VLLMModel(LLM):
     def __init__(
@@ -299,7 +308,12 @@ class VLLMModel(LLM):
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len",
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else None
+        )

         return model_config

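The limit_mm_per_prompt option reaches the model-config sanitizer as a JSON string (e.g. from a launch request) and is decoded into a dict before being forwarded to vLLM. A minimal sketch of that parsing, with a made-up config value:

import json

# Hypothetical user-supplied value: a JSON string, not a dict.
model_config = {"limit_mm_per_prompt": '{"image": 4}'}

model_config["limit_mm_per_prompt"] = (
    json.loads(model_config.get("limit_mm_per_prompt"))
    if model_config.get("limit_mm_per_prompt")
    else None
)
print(model_config["limit_mm_per_prompt"])  # {'image': 4} -> up to 4 images per prompt
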
@@ -429,6 +443,7 @@ class VLLMModel(LLM):
             usage=usage,
         )

+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -660,6 +675,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 yield self._to_chat_completion_chunk(chunk)
                 i += 1

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -736,13 +752,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         )
         return generate_config

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-        # only support single image, waiting vllm support multi images
         model_family = self.model_family.model_family or self.model_family.model_name
         prompt, images = self.get_specific_prompt(model_family, messages)

@@ -750,11 +766,16 @@
             inputs = {
                 "prompt": prompt,
             }
-
+        elif len(images) == 1:
             inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)

         stream = generate_config.get("stream", None)

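Together with the limit_mm_per_prompt parsing above, these changes drop the old single-image restriction on vision models: every image in a message is decoded and handed to vLLM as a list under multi_modal_data. A sketch of a request that would exercise the multi-image path; the URLs are placeholders, and the OpenAI-style content layout is the format the message parsing above expects:

# Illustrative only: a user turn carrying two images.  With a vLLM version that
# registers the vision model (>= 0.6.1 for internvl2 per the change above) and a
# launch-time limit such as limit_mm_per_prompt='{"image": 2}', both images end up
# in multi_modal_data as a list instead of only the last one.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Compare these two pictures."},
            {"type": "image_url", "image_url": {"url": "https://example.com/a.jpg"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/b.jpg"}},
        ],
    }
]
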

xinference/model/llm/vllm/utils.py
ADDED

@@ -0,0 +1,42 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        logger.info("vllm_check")
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper

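The decorator is a no-op passthrough when vllm (and hence AsyncEngineDeadError) cannot be imported; otherwise it turns a dead-engine error into a process exit so the supervisor can recreate the replica. A small sketch of applying it to a made-up class; DummyModel and its stop() are hypothetical, and only xinference 0.15.3 on the path is assumed:

import asyncio

from xinference.model.llm.vllm.utils import vllm_check


class DummyModel:
    def stop(self):
        # Called by the wrapper before exiting when the engine has died.
        pass

    @vllm_check
    async def async_generate(self, prompt):
        return f"echo: {prompt}"


# Without vllm installed the decorator returns the function unchanged; with vllm
# installed it simply awaits it, so either way this prints "echo: hi".
print(asyncio.run(DummyModel().async_generate("hi")))
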
xinference/model/rerank/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import gc
+import importlib
 import logging
 import os
 import threading
@@ -178,9 +179,27 @@ class RerankModel:
         return rerank_type

     def load(self):
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if (
+            self._auto_detect_type(self._model_path) != "normal"
+            and flash_attn_installed
+        ):
+            logger.warning(
+                "flash_attn can only support fp16 and bf16, "
+                "will force set `use_fp16` to True"
+            )
+            self._use_fp16 = True
         if self._model_spec.type == "normal":
             try:
+                import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                if sentence_transformers.__version__ < "3.1.0":
+                    raise ValueError(
+                        "The sentence_transformers version must be greater than 3.1.0. "
+                        "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                        "https://github.com/UKPLab/sentence-transformers"
+                    )
             except ImportError:
                 error_message = "Failed to import module 'sentence-transformers'"
                 installation_guide = [

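The load path above only probes for flash-attn (it never imports it at this point) and, for the non-"normal" rerankers, forces fp16 since flash-attn supports only fp16/bf16. A standalone sketch of that probe using just the standard library:

import importlib.util

# find_spec returns None when the package is absent, so the probe neither raises
# nor triggers flash_attn's (potentially CUDA-dependent) import machinery.
flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
print("flash_attn available:", flash_attn_installed)
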

xinference/model/rerank/model_spec.json
CHANGED

@@ -54,5 +54,13 @@
     "max_tokens": 1024,
     "model_id": "jinaai/jina-reranker-v2-base-multilingual",
     "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "openbmb/MiniCPM-Reranker",
+    "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
   }
 ]

xinference/model/rerank/model_spec_modelscope.json
CHANGED

@@ -49,5 +49,13 @@
     "max_tokens": 2048,
     "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "minicpm-reranker",
+    "type": "normal",
+    "language": ["en", "zh"],
+    "max_tokens": 1024,
+    "model_id": "OpenBMB/MiniCPM-Reranker",
+    "model_hub": "modelscope"
   }
 ]

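The new minicpm-reranker entry (Hugging Face id openbmb/MiniCPM-Reranker, ModelScope id OpenBMB/MiniCPM-Reranker) is registered as a "normal" reranker, so it is served like the existing ones. A hedged usage sketch, assuming a running endpoint and the standard rerank client API; the endpoint URL, documents, and query are example values:

# Illustrative only: launch the newly added reranker and score two documents.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
uid = client.launch_model(model_name="minicpm-reranker", model_type="rerank")
model = client.get_model(uid)

result = model.rerank(
    documents=[
        "MiniCPM is a family of small language models.",
        "The weather in Hangzhou is mild in autumn.",
    ],
    query="What is MiniCPM?",
)
print(result)
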
xinference/model/utils.py
CHANGED

@@ -300,31 +300,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir


-def patch_trust_remote_code():
-    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
-    models will fail to load, e.g. jina-embeddings-v2-base-en
-
-    :return:
-    """
-    try:
-        from transformers.dynamic_module_utils import resolve_trust_remote_code
-    except ImportError:
-        logger.error("Patch transformers trust_remote_code failed.")
-    else:
-
-        def _patched_resolve_trust_remote_code(*args, **kwargs):
-            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
-            return True
-
-        if (
-            resolve_trust_remote_code.__code__
-            != _patched_resolve_trust_remote_code.__code__
-        ):
-            resolve_trust_remote_code.__code__ = (
-                _patched_resolve_trust_remote_code.__code__
-            )
-
-
 def select_device(device):
     try:
         import torch  # noqa: F401


xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.754740c0.js",
+    "main.js": "./static/js/main.e51a356d.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.754740c0.js.map": "./static/js/main.754740c0.js.map"
+    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.754740c0.js"
+    "static/js/main.e51a356d.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.754740c0.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>