xinference 0.15.2__py3-none-any.whl → 0.15.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +29 -2
- xinference/client/restful/restful_client.py +10 -0
- xinference/constants.py +4 -0
- xinference/core/image_interface.py +76 -23
- xinference/core/model.py +80 -39
- xinference/core/progress_tracker.py +187 -0
- xinference/core/supervisor.py +11 -0
- xinference/core/worker.py +1 -0
- xinference/model/audio/chattts.py +2 -1
- xinference/model/audio/core.py +0 -2
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/audio/model_spec_modelscope.json +9 -0
- xinference/model/embedding/core.py +14 -5
- xinference/model/embedding/model_spec.json +7 -0
- xinference/model/embedding/model_spec_modelscope.json +9 -1
- xinference/model/image/core.py +6 -7
- xinference/model/image/sdapi.py +35 -4
- xinference/model/image/stable_diffusion/core.py +212 -70
- xinference/model/llm/llm_family.json +28 -40
- xinference/model/llm/llm_family_modelscope.json +18 -22
- xinference/model/llm/transformers/cogvlm2.py +2 -1
- xinference/model/llm/transformers/cogvlm2_video.py +2 -0
- xinference/model/llm/transformers/core.py +6 -2
- xinference/model/llm/transformers/deepseek_vl.py +2 -0
- xinference/model/llm/transformers/glm4v.py +2 -1
- xinference/model/llm/transformers/intern_vl.py +2 -0
- xinference/model/llm/transformers/minicpmv25.py +2 -0
- xinference/model/llm/transformers/minicpmv26.py +2 -0
- xinference/model/llm/transformers/omnilmm.py +2 -0
- xinference/model/llm/transformers/qwen2_audio.py +11 -4
- xinference/model/llm/transformers/qwen2_vl.py +2 -28
- xinference/model/llm/transformers/qwen_vl.py +2 -1
- xinference/model/llm/transformers/utils.py +35 -2
- xinference/model/llm/transformers/yi_vl.py +2 -0
- xinference/model/llm/utils.py +72 -17
- xinference/model/llm/vllm/core.py +69 -9
- xinference/model/llm/vllm/utils.py +41 -0
- xinference/model/rerank/core.py +19 -0
- xinference/model/rerank/model_spec.json +8 -0
- xinference/model/rerank/model_spec_modelscope.json +8 -0
- xinference/model/utils.py +7 -29
- xinference/model/video/core.py +0 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.29578905.js → main.e51a356d.js} +3 -3
- xinference/web/ui/build/static/js/main.e51a356d.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +1 -0
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/METADATA +6 -5
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/RECORD +55 -53
- xinference/web/ui/build/static/js/main.29578905.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +0 -1
- /xinference/web/ui/build/static/js/{main.29578905.js.LICENSE.txt → main.e51a356d.js.LICENSE.txt} +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/LICENSE +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/WHEEL +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/entry_points.txt +0 -0
- {xinference-0.15.2.dist-info → xinference-0.15.4.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/yi_vl.py
CHANGED

@@ -29,6 +29,7 @@ from ..utils import (
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean

 logger = logging.getLogger(__name__)

@@ -99,6 +100,7 @@ class YiVLChatModel(PytorchChatModel):
             raise RuntimeError("Only one image per message is supported by Yi VL.")
         return content

+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
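Several transformers backends in this release (yi_vl, qwen_vl, glm4v, minicpmv25/26, deepseek_vl, cogvlm2, intern_vl, omnilmm) receive the same two-line change: import `cache_clean` and decorate `chat` with it. The decorator body itself is not shown in this diff; the sketch below is only an illustration of the general pattern, assuming it does nothing more than free accelerator cache after the wrapped call, and is not the actual xinference implementation.

```python
import functools
import gc

import torch


def cache_clean(fn):
    """Illustrative sketch only; the real decorator lives in
    xinference/model/llm/transformers/utils.py and may differ."""

    @functools.wraps(fn)
    def wrapper(self, *args, **kwargs):
        try:
            return fn(self, *args, **kwargs)
        finally:
            # Release cached allocations once the generation call returns.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return wrapper
```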
xinference/model/llm/utils.py
CHANGED

@@ -29,6 +29,7 @@ from ...types import (
     ChatCompletion,
     ChatCompletionChoice,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -50,6 +51,7 @@ QWEN_TOOL_CALL_FAMILY = [
     "qwen1.5-moe-chat",
     "qwen2-instruct",
     "qwen2-moe-instruct",
+    "qwen2.5-instruct",
 ]

 GLM4_TOOL_CALL_FAMILY = [
@@ -57,6 +59,10 @@ GLM4_TOOL_CALL_FAMILY = [
     "glm4-chat-1m",
 ]

+LLAMA3_TOOL_CALL_FAMILY = [
+    "llama-3.1-instruct",
+]
+
 QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]


@@ -113,7 +119,7 @@ class ChatModelMixin:
         return self._build_from_raw_template(messages, chat_template, **kwargs)

     @staticmethod
-    def get_specific_prompt(model_family: str, messages: List[
+    def get_specific_prompt(model_family: str, messages: List[ChatCompletionMessage]):
         """
         Inspired by FastChat. Format chat history into a prompt according to the prompty style of
         different models.
@@ -129,7 +135,7 @@ class ChatModelMixin:
             ret = (
                 "<s>"
                 if system_prompt == ""
-                else "<s><|im_start|>system\n"
+                else "<s><|im_start|>system\n"  # type: ignore
                 + system_prompt
                 + intra_message_sep
                 + "\n"
@@ -159,14 +165,25 @@ class ChatModelMixin:
                     for image_url in image_urls:
                         fut = executor.submit(_decode_image, image_url)
                         image_futures.append(fut)
-                    images
+                    images.extend([fut.result() for fut in image_futures])
                     if len(image_futures) == 0:
                         ret += role + "\n" + text + intra_message_sep + "\n"
                     else:
+                        placeholders = "\n".join(
+                            f"Image-{i+1}: <image>\n"
+                            for i in range(
+                                len(images) - len(image_futures), len(images)
+                            )
+                        )
                         ret += (
-                            role
+                            role
+                            + "\n"
+                            + f"{placeholders}\n{text}"
+                            + intra_message_sep
+                            + "\n"
                         )
-
+            if len(images) == 1:
+                ret = ret.replace("Image-1: <image>\n", "<image>\n")
             return ret, images
         else:
             raise ValueError(f"Invalid model family: {model_family}")
@@ -322,8 +339,9 @@ class ChatModelMixin:
         for content in contents:
             content = content.strip()
             if content:
-
-
+                pos = content.find(QWEN_TOOL_CALL_SYMBOLS[0])
+                if pos != -1:
+                    content = content[pos + len(QWEN_TOOL_CALL_SYMBOLS[0]) :]
                 content = content.strip()
                 try:
                     res = json.loads(content)
@@ -342,6 +360,15 @@ class ChatModelMixin:
         text = c["choices"][0]["text"]
         return cls._handle_qwen_tool_result(text)

+    @classmethod
+    def _eval_llama3_chat_arguments(cls, c) -> List[Tuple]:
+        text = c["choices"][0]["text"]
+        try:
+            data = eval(text, {}, {})
+            return [(None, data["name"], data["parameters"])]
+        except Exception:
+            return [(text, None, None)]
+
     @classmethod
     def _eval_tool_arguments(cls, model_family, c):
         family = model_family.model_family or model_family.model_name
@@ -349,6 +376,8 @@ class ChatModelMixin:
             result = cls._eval_glm_chat_arguments(c)
         elif family in QWEN_TOOL_CALL_FAMILY:
             result = cls._eval_qwen_chat_arguments(c)
+        elif family in LLAMA3_TOOL_CALL_FAMILY:
+            result = cls._eval_llama3_chat_arguments(c)
         else:
             raise Exception(
                 f"Model {model_family.model_name} is not support tool calls."
@@ -365,16 +394,14 @@ class ChatModelMixin:
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
-
-                    {
-
-
-                        "
-
-
-
-                    }
-                ]
+                    {
+                        "id": f"call_{_id}",
+                        "type": "function",
+                        "function": {
+                            "name": func,
+                            "arguments": json.dumps(args, ensure_ascii=False),
+                        },
+                    }
                 )
             else:
                 failed_contents.append(content)
@@ -460,6 +487,34 @@ class ChatModelMixin:
             "usage": usage,
         }

+    def _transform_messages(
+        self,
+        messages: List[ChatCompletionMessage],
+    ):
+        transformed_messages = []
+        for msg in messages:
+            new_content = []
+            role = msg["role"]
+            content = msg["content"]
+            if isinstance(content, str):
+                new_content.append({"type": "text", "text": content})
+            elif isinstance(content, List):
+                for item in content:  # type: ignore
+                    if "text" in item:
+                        new_content.append({"type": "text", "text": item["text"]})
+                    elif "image_url" in item:
+                        new_content.append(
+                            {"type": "image", "image": item["image_url"]["url"]}
+                        )
+                    elif "video_url" in item:
+                        new_content.append(
+                            {"type": "video", "video": item["video_url"]["url"]}
+                        )
+            new_message = {"role": role, "content": new_content}
+            transformed_messages.append(new_message)
+
+        return transformed_messages
+

 def get_file_location(
     llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
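The new `LLAMA3_TOOL_CALL_FAMILY` path parses a Llama 3.1 tool call from the raw completion text, which is expected to be a Python dict literal with `name` and `parameters` keys; anything unparsable falls back to plain content. A standalone illustration of that logic (the sample payloads below are made up):

```python
from typing import List, Tuple


def eval_llama3_chat_arguments(c) -> List[Tuple]:
    # Same logic as the _eval_llama3_chat_arguments classmethod added above:
    # a dict-literal completion becomes (None, name, parameters), anything
    # else is returned as ordinary text content.
    text = c["choices"][0]["text"]
    try:
        data = eval(text, {}, {})
        return [(None, data["name"], data["parameters"])]
    except Exception:
        return [(text, None, None)]


tool_call = {
    "choices": [{"text": '{"name": "get_weather", "parameters": {"city": "Beijing"}}'}]
}
plain = {"choices": [{"text": "The weather in Beijing is sunny."}]}

print(eval_llama3_chat_arguments(tool_call))  # [(None, 'get_weather', {'city': 'Beijing'})]
print(eval_llama3_chat_arguments(plain))      # [('The weather in Beijing is sunny.', None, None)]
```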
xinference/model/llm/vllm/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import asyncio
+import json
 import logging
 import multiprocessing
 import os
@@ -33,6 +34,7 @@ from typing import (
 from ....types import (
     ChatCompletion,
     ChatCompletionChunk,
+    ChatCompletionMessage,
     Completion,
     CompletionChoice,
     CompletionChunk,
@@ -47,6 +49,7 @@ from ..utils import (
     ChatModelMixin,
     generate_completion_chunk,
 )
+from .utils import vllm_check

 logger = logging.getLogger(__name__)

@@ -65,6 +68,7 @@ class VLLMModelConfig(TypedDict, total=False):
     max_num_seqs: int
     quantization: Optional[str]
     max_model_len: Optional[int]
+    limit_mm_per_prompt: Optional[Dict[str, int]]


 class VLLMGenerateConfig(TypedDict, total=False):
@@ -90,9 +94,7 @@ try:
 except ImportError:
     VLLM_INSTALLED = False

-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = [
-    "internvl2",
-]
+VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -171,6 +173,12 @@ if VLLM_INSTALLED and vllm.__version__ > "0.5.3":
     VLLM_SUPPORTED_MODELS.append("llama-3.1")
     VLLM_SUPPORTED_CHAT_MODELS.append("llama-3.1-instruct")

+if VLLM_INSTALLED and vllm.__version__ >= "0.6.1":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
+
+if VLLM_INSTALLED and vllm.__version__ >= "0.6.3":
+    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
+

 class VLLMModel(LLM):
     def __init__(
@@ -304,7 +312,7 @@ class VLLMModel(LLM):
         model_config.setdefault("gpu_memory_utilization", 0.90)
         model_config.setdefault("max_num_seqs", 256)
         model_config.setdefault("quantization", None)
-        model_config.setdefault("max_model_len",
+        model_config.setdefault("max_model_len", None)

         return model_config

@@ -434,6 +442,7 @@ class VLLMModel(LLM):
             usage=usage,
         )

+    @vllm_check
     async def async_generate(
         self,
         prompt: Union[str, Dict[str, Any]],
@@ -665,6 +674,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
                 yield self._to_chat_completion_chunk(chunk)
                 i += 1

+    @vllm_check
     async def async_chat(
         self,
         messages: List[Dict],
@@ -722,6 +732,33 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             return False
         return VLLM_INSTALLED

+    def _sanitize_model_config(
+        self, model_config: Optional[VLLMModelConfig]
+    ) -> VLLMModelConfig:
+        if model_config is None:
+            model_config = VLLMModelConfig()
+
+        cuda_count = self._get_cuda_count()
+
+        model_config.setdefault("tokenizer_mode", "auto")
+        model_config.setdefault("trust_remote_code", True)
+        model_config.setdefault("tensor_parallel_size", cuda_count)
+        model_config.setdefault("block_size", 16)
+        model_config.setdefault("swap_space", 4)
+        model_config.setdefault("gpu_memory_utilization", 0.90)
+        model_config.setdefault("max_num_seqs", 256)
+        model_config.setdefault("quantization", None)
+        model_config.setdefault("max_model_len", None)
+        model_config["limit_mm_per_prompt"] = (
+            json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
+            if model_config.get("limit_mm_per_prompt")
+            else {
+                "image": 2,  # default 2 images all chat
+            }
+        )
+
+        return model_config
+
     def _sanitize_chat_config(
         self,
         generate_config: Optional[Dict] = None,
@@ -741,25 +778,48 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         )
         return generate_config

+    @vllm_check
     async def async_chat(
         self,
-        messages: List[
+        messages: List[ChatCompletionMessage],  # type: ignore
         generate_config: Optional[Dict] = None,
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
-
+        messages = self._transform_messages(messages)
+        tools = generate_config.pop("tools", []) if generate_config else None
+
         model_family = self.model_family.model_family or self.model_family.model_name
-        prompt, images = self.get_specific_prompt(model_family, messages)

-        if
+        if "internvl2" not in model_family.lower():
+            from qwen_vl_utils import process_vision_info
+
+            full_context_kwargs = {}
+            if tools and model_family in QWEN_TOOL_CALL_FAMILY:
+                full_context_kwargs["tools"] = tools
+            assert self.model_family.chat_template is not None
+            prompt = self.get_full_context(
+                messages, self.model_family.chat_template, **full_context_kwargs
+            )
+            images, video_inputs = process_vision_info(messages)
+            if video_inputs:
+                raise ValueError("Not support video input now.")
+        else:
+            prompt, images = self.get_specific_prompt(model_family, messages)
+
+        if not images:
             inputs = {
                 "prompt": prompt,
             }
-
+        elif len(images) == 1:
             inputs = {
                 "prompt": prompt,
                 "multi_modal_data": {"image": images[-1]},  # type: ignore
             }
+        else:
+            inputs = {
+                "prompt": prompt,
+                "multi_modal_data": {"image": images},  # type: ignore
+            }
         generate_config = self._sanitize_chat_config(generate_config)

         stream = generate_config.get("stream", None)
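The new `limit_mm_per_prompt` option is accepted as a JSON string and parsed in `VLLMVisionModel._sanitize_model_config`, defaulting to two images per prompt. A small sketch of that parsing step in isolation:

```python
import json
from typing import Dict, Optional


def resolve_limit_mm_per_prompt(raw: Optional[str]) -> Dict[str, int]:
    # Mirrors the sanitize logic above: parse the user-supplied JSON string,
    # otherwise fall back to allowing 2 images per prompt.
    return json.loads(raw) if raw else {"image": 2}


print(resolve_limit_mm_per_prompt(None))            # {'image': 2}
print(resolve_limit_mm_per_prompt('{"image": 4}'))  # {'image': 4}
```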
xinference/model/llm/vllm/utils.py
ADDED

@@ -0,0 +1,41 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def vllm_check(fn):
+    try:
+        from vllm.engine.async_llm_engine import AsyncEngineDeadError
+    except:
+        return fn
+
+    @functools.wraps(fn)
+    async def _async_wrapper(self, *args, **kwargs):
+        try:
+            return await fn(self, *args, **kwargs)
+        except AsyncEngineDeadError:
+            logger.info("Detecting vLLM is not health, prepare to quit the process")
+            try:
+                self.stop()
+            except:
+                # ignore error when stop
+                pass
+            # Just kill the process and let xinference auto-recover the model
+            os._exit(1)
+
+    return _async_wrapper
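Usage-wise, `vllm_check` wraps async engine entry points (as it does for `async_generate` and `async_chat` above) so that an `AsyncEngineDeadError` terminates the worker process and lets xinference recover the model. A toy example of decorating an async method, assuming xinference 0.15.4 is installed; the `DummyEngine` class is hypothetical and only shows the decorator being transparent while the engine is healthy:

```python
import asyncio

from xinference.model.llm.vllm.utils import vllm_check


class DummyEngine:
    def stop(self):
        # Called by vllm_check before the process exits on engine death.
        print("stopping engine")

    @vllm_check
    async def generate(self, prompt: str) -> str:
        return f"echo: {prompt}"


print(asyncio.run(DummyEngine().generate("hi")))  # echo: hi
```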
xinference/model/rerank/core.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import gc
+import importlib
 import logging
 import os
 import threading
@@ -178,9 +179,27 @@ class RerankModel:
         return rerank_type

     def load(self):
+        flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
+        if (
+            self._auto_detect_type(self._model_path) != "normal"
+            and flash_attn_installed
+        ):
+            logger.warning(
+                "flash_attn can only support fp16 and bf16, "
+                "will force set `use_fp16` to True"
+            )
+            self._use_fp16 = True
         if self._model_spec.type == "normal":
             try:
+                import sentence_transformers
                 from sentence_transformers.cross_encoder import CrossEncoder
+
+                if sentence_transformers.__version__ < "3.1.0":
+                    raise ValueError(
+                        "The sentence_transformers version must be greater than 3.1.0. "
+                        "Please upgrade your version via `pip install -U sentence_transformers` or refer to "
+                        "https://github.com/UKPLab/sentence-transformers"
+                    )
             except ImportError:
                 error_message = "Failed to import module 'sentence-transformers'"
                 installation_guide = [
xinference/model/rerank/model_spec.json
CHANGED

@@ -54,5 +54,13 @@
         "max_tokens": 1024,
         "model_id": "jinaai/jina-reranker-v2-base-multilingual",
         "model_revision": "298e48cada4a9318650d7fbd795f63827f884087"
+    },
+    {
+        "model_name": "minicpm-reranker",
+        "type": "normal",
+        "language": ["en", "zh"],
+        "max_tokens": 1024,
+        "model_id": "openbmb/MiniCPM-Reranker",
+        "model_revision": "5d2fd7345b6444c89d4c0fa59c92272888f3f2d0"
     }
 ]
xinference/model/rerank/model_spec_modelscope.json
CHANGED

@@ -49,5 +49,13 @@
         "max_tokens": 2048,
         "model_id": "mirror013/bge-reranker-v2-minicpm-layerwise",
         "model_hub": "modelscope"
+    },
+    {
+        "model_name": "minicpm-reranker",
+        "type": "normal",
+        "language": ["en", "zh"],
+        "max_tokens": 1024,
+        "model_id": "OpenBMB/MiniCPM-Reranker",
+        "model_hub": "modelscope"
     }
 ]
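With the new `minicpm-reranker` spec registered for both Hugging Face and ModelScope, it can be launched like any other rerank model. A hedged client-side example, assuming a local Xinference server on the default endpoint and that the usual rerank client API applies to this model:

```python
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(model_name="minicpm-reranker", model_type="rerank")
model = client.get_model(model_uid)

corpus = [
    "MiniCPM-Reranker is a bilingual reranking model.",
    "Stable Diffusion generates images from text prompts.",
]
# Scores and reorders the corpus against the query.
print(model.rerank(corpus, "Which model reorders search results?"))
```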
xinference/model/utils.py
CHANGED

@@ -23,12 +23,15 @@ import huggingface_hub
 import numpy as np
 import torch

-from ..constants import
+from ..constants import (
+    XINFERENCE_CACHE_DIR,
+    XINFERENCE_DOWNLOAD_MAX_ATTEMPTS,
+    XINFERENCE_ENV_MODEL_SRC,
+)
 from ..device_utils import get_available_device, is_device_available
 from .core import CacheableModelSpec

 logger = logging.getLogger(__name__)
-MAX_ATTEMPTS = 3
 IS_NEW_HUGGINGFACE_HUB: bool = huggingface_hub.__version__ >= "0.23.0"


@@ -100,11 +103,11 @@ def retry_download(
     **kwargs,
 ):
     last_ex = None
-    for current_attempt in range(1,
+    for current_attempt in range(1, XINFERENCE_DOWNLOAD_MAX_ATTEMPTS + 1):
         try:
             return download_func(*args, **kwargs)
         except Exception as e:
-            remaining_attempts =
+            remaining_attempts = XINFERENCE_DOWNLOAD_MAX_ATTEMPTS - current_attempt
             last_ex = e
             logger.debug(
                 "Download failed: %s, download func: %s, download args: %s, kwargs: %s",
@@ -300,31 +303,6 @@ def cache(model_spec: CacheableModelSpec, model_description_type: type):
     return cache_dir


-def patch_trust_remote_code():
-    """sentence-transformers calls transformers without the trust_remote_code=True, some embedding
-    models will fail to load, e.g. jina-embeddings-v2-base-en
-
-    :return:
-    """
-    try:
-        from transformers.dynamic_module_utils import resolve_trust_remote_code
-    except ImportError:
-        logger.error("Patch transformers trust_remote_code failed.")
-    else:
-
-        def _patched_resolve_trust_remote_code(*args, **kwargs):
-            logger.info("Patched resolve_trust_remote_code: %s %s", args, kwargs)
-            return True
-
-        if (
-            resolve_trust_remote_code.__code__
-            != _patched_resolve_trust_remote_code.__code__
-        ):
-            resolve_trust_remote_code.__code__ = (
-                _patched_resolve_trust_remote_code.__code__
-            )
-
-
 def select_device(device):
     try:
         import torch  # noqa: F401
xinference/model/video/core.py
CHANGED

@@ -21,8 +21,6 @@ from ..core import CacheableModelSpec, ModelDescription
 from ..utils import valid_model_revision
 from .diffusers import DiffUsersVideoModel

-MAX_ATTEMPTS = 3
-
 logger = logging.getLogger(__name__)

 MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list)

xinference/web/ui/build/asset-manifest.json
CHANGED

@@ -1,14 +1,14 @@
 {
   "files": {
     "main.css": "./static/css/main.5061c4c3.css",
-    "main.js": "./static/js/main.29578905.js",
+    "main.js": "./static/js/main.e51a356d.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
     "main.5061c4c3.css.map": "./static/css/main.5061c4c3.css.map",
-    "main.29578905.js.map": "./static/js/main.29578905.js.map"
+    "main.e51a356d.js.map": "./static/js/main.e51a356d.js.map"
   },
   "entrypoints": [
     "static/css/main.5061c4c3.css",
-    "static/js/main.29578905.js"
+    "static/js/main.e51a356d.js"
   ]
 }

xinference/web/ui/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.29578905.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.e51a356d.js"></script><link href="./static/css/main.5061c4c3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>