xinference 1.10.1__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +462 -3
- xinference/client/restful/async_restful_client.py +158 -5
- xinference/client/restful/restful_client.py +131 -0
- xinference/core/supervisor.py +12 -0
- xinference/model/audio/model_spec.json +20 -20
- xinference/model/image/model_spec.json +159 -159
- xinference/model/llm/__init__.py +2 -2
- xinference/model/llm/llm_family.json +843 -180
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +20 -6
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +93 -16
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/utils.py +3 -0
- xinference/model/llm/utils.py +37 -24
- xinference/model/llm/vllm/core.py +128 -69
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.e4d9a9e1.js} +3 -3
- xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.map → main.e4d9a9e1.js.map} +1 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e6770a05771952175c9fbf48fce283c9bb1bc8b5763e39edc36d099d1fe16b4a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/METADATA +8 -5
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/RECORD +37 -36
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.e4d9a9e1.js.LICENSE.txt} +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/WHEEL +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py
CHANGED
@@ -75,6 +75,8 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-VL-Thinking",
     "Qwen3-Next-Instruct",
     "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -100,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -143,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
            messages = self.convert_messages_with_content_list_to_str_conversion(
                messages
            )
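The widened condition above keeps structured (list-form) message content intact for vision and, now, audio models, and only flattens it for text-only ones. As an illustration of what that flattening step amounts to, here is a minimal sketch with a hypothetical `flatten_content_lists` helper standing in for xinference's `convert_messages_with_content_list_to_str_conversion`; the real implementation may differ:

```python
# Hypothetical helper for illustration only; the real
# convert_messages_with_content_list_to_str_conversion may differ in detail.
from typing import Any, Dict, List


def flatten_content_lists(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collapse OpenAI-style content lists into plain strings for text-only models."""
    flattened = []
    for msg in messages:
        content = msg.get("content")
        if isinstance(content, list):
            # Keep only the text parts; image/audio parts mean nothing to a
            # text-only chat template.
            text = "".join(
                part.get("text", "")
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
            msg = {**msg, "content": text}
        flattened.append(msg)
    return flattened


msgs = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
print(flatten_content_lists(msgs))  # [{'role': 'user', 'content': 'hi'}]
```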
@@ -186,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -853,11 +853,11 @@ class ChatModelMixin:
                 "tool_calls": tool_calls,
             }
 
-
-
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
             usage = None
+        else:
+            usage = c.get("usage")
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -882,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-
-        c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-
-            reasoning_content,
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -913,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if
-                failed_contents.append(
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-
-
-
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
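To make the new usage handling above easier to follow, here is a small, self-contained sketch of the fallback it implements; `resolve_usage` is a hypothetical name, and `chunk` is a dict shaped like the completion chunks the mixin receives:

```python
# Illustrative only: mirrors the usage fallback in the hunks above.
from typing import Any, Dict, Optional


def resolve_usage(chunk: Dict[str, Any], finish_reason: Optional[str]) -> Optional[Dict[str, int]]:
    if finish_reason == "tool_calls":
        # Tool-call completion chunks carry no meaningful token accounting.
        return None
    usage = chunk.get("usage")
    if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
        # Placeholder values when the backend did not report usage.
        usage = {"prompt_tokens": -1, "completion_tokens": -1, "total_tokens": -1}
    return usage


print(resolve_usage({"usage": {"prompt_tokens": 10, "completion_tokens": 3}}, "stop"))
print(resolve_usage({}, "tool_calls"))  # None
```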
xinference/model/llm/vllm/core.py
CHANGED
@@ -131,7 +131,7 @@ except ImportError:
     VLLM_INSTALLED = False
     VLLM_VERSION = None
 
-
+VLLM_SUPPORTED_MULTI_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -229,34 +229,37 @@ if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.5.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.1"):
-
-
-
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5-MPO")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL3")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.3"):
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
-
-
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("llama-3.2-vision-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("QvQ-72B-Preview")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.2"):
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-vl-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwenLong-l1")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-omni")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("gemma-3-it")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.4"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
@@ -272,7 +275,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.1"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Ernie4.5")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.1v-thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
@@ -280,7 +283,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
-
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.5v")
     VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
@@ -291,9 +294,11 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Thinking")
 
-if VLLM_INSTALLED and VLLM_VERSION
-
-
+if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.11.0"):
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Thinking")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Thinking")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Instruct")
 
 
 class VLLMModel(LLM):
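The pattern in these hunks is to register model families into capability lists only when the detected vLLM version is new enough. A condensed sketch of that pattern, using made-up model names and a pinned version instead of runtime detection:

```python
# Sketch of the version-gating pattern above; the names and the pinned
# version are assumptions for the example, not xinference's real lists.
from typing import List

from packaging import version

VLLM_VERSION = version.parse("0.11.0")  # assume detected at import time
SUPPORTED_MULTI_MODEL_LIST: List[str] = []

if VLLM_VERSION >= version.parse("0.10.2"):
    SUPPORTED_MULTI_MODEL_LIST.append("example-vl-model")
if VLLM_VERSION >= version.parse("0.11.0"):
    SUPPORTED_MULTI_MODEL_LIST.append("example-omni-model")

print(SUPPORTED_MULTI_MODEL_LIST)
```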
@@ -545,7 +550,7 @@ class VLLMModel(LLM):
             # patch vllm Executor.get_class
             Executor.get_class = lambda vllm_config: executor_cls
             self._engine = AsyncLLMEngine.from_engine_args(engine_args)
-        except:
+        except:  # noqa: E722
            logger.exception("Creating vllm engine failed")
            self._loading_error = sys.exc_info()
 
@@ -714,7 +719,7 @@ class VLLMModel(LLM):
         logger.info("Detecting vLLM is not health, prepare to quit the process")
         try:
             self.stop()
-        except:
+        except:  # noqa: E722
             # ignore error when stop
             pass
         # Just kill the process and let xinference auto-recover the model
@@ -857,7 +862,7 @@ class VLLMModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
@@ -988,7 +993,10 @@ class VLLMModel(LLM):
             from vllm import TokensPrompt
 
             token_ids = await asyncio.to_thread(
-                self._tokenize,
+                self._tokenize,
+                tokenizer,
+                prompt,  # type: ignore
+                config,
             )
             return TokensPrompt(prompt_token_ids=token_ids)
 
@@ -1082,18 +1090,43 @@ class VLLMModel(LLM):
                 logger.warning(f"Failed to create GuidedDecodingParams: {e}")
                 guided_options = None
 
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                import inspect
+
+                sp_sig = inspect.signature(SamplingParams)
+                # For v0.9.2 and similar versions, prioritize guided_decoding over structured_outputs
+                # structured_outputs was introduced later (around v0.11.0) and may not accept
+                # GuidedDecodingParams in earlier versions even if the parameter exists
+                if "guided_decoding" in sp_sig.parameters:
+                    sampling_params = SamplingParams(
+                        guided_decoding=guided_options, **sanitized_generate_config
+                    )
+                elif "structured_outputs" in sp_sig.parameters:
+                    try:
+                        sampling_params = SamplingParams(
+                            structured_outputs=guided_options,
+                            **sanitized_generate_config,
+                        )
+                    except TypeError as e:
+                        if "structured_outputs" in str(e):
+                            # structured_outputs parameter exists but doesn't accept GuidedDecodingParams
+                            # Fall back to no guided decoding
+                            logger.warning(
+                                f"structured_outputs parameter failed: {e}. "
+                                "Falling back to no guided decoding for vLLM version compatibility."
+                            )
+                            sampling_params = SamplingParams(
+                                **sanitized_generate_config
+                            )
+                        else:
+                            raise
+                else:
+                    sampling_params = SamplingParams(**sanitized_generate_config)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to create SamplingParams with guided decoding: {e}"
                 )
+                sampling_params = SamplingParams(**sanitized_generate_config)
             else:
                 # ignore generate configs for older versions
                 sanitized_generate_config.pop("guided_json", None)
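The compatibility shim above decides at runtime whether `SamplingParams` takes `guided_decoding` or the newer `structured_outputs` keyword. Here is the same introspection idea in isolation, with dummy classes standing in for vLLM's `SamplingParams` so the snippet runs without vLLM installed:

```python
# Dummy classes are stand-ins; only the inspect-based dispatch mirrors the hunk above.
import inspect


class OldStyleParams:  # accepts guided_decoding (per the diff, older vLLM releases)
    def __init__(self, guided_decoding=None, **kwargs):
        self.guided_decoding = guided_decoding
        self.extra = kwargs


class NewStyleParams:  # accepts structured_outputs (per the diff, around vLLM 0.11.0)
    def __init__(self, structured_outputs=None, **kwargs):
        self.structured_outputs = structured_outputs
        self.extra = kwargs


def build_params(cls, guided_options, **config):
    params = inspect.signature(cls).parameters
    if "guided_decoding" in params:
        return cls(guided_decoding=guided_options, **config)
    if "structured_outputs" in params:
        return cls(structured_outputs=guided_options, **config)
    # Neither keyword exists: fall back to plain sampling parameters.
    return cls(**config)


print(type(build_params(OldStyleParams, {"json": "{}"}, temperature=0.7)).__name__)
print(type(build_params(NewStyleParams, {"json": "{}"}, temperature=0.7)).__name__)
```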
@@ -1111,7 +1144,9 @@ class VLLMModel(LLM):
             # this requires tokenizing
             tokenizer = await self._get_tokenizer(lora_request)
             prompt_or_token_ids = await self._gen_tokens_prompt(
-                tokenizer,
+                tokenizer,
+                prompt,
+                sanitized_generate_config,  # type: ignore
             )
             sampling_params.max_tokens = max_tokens = self._context_length - len(  # type: ignore
                 prompt_or_token_ids["prompt_token_ids"]  # type: ignore
@@ -1266,11 +1301,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
@@ -1430,7 +1464,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             return self._to_chat_completion(c, self.reasoning_parser)
 
 
-class
+class VLLMMultiModel(VLLMModel, ChatModelMixin):
     @classmethod
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
@@ -1442,11 +1476,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
@@ -1456,12 +1489,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             if "4" not in quantization:
                 return False
         if isinstance(llm_family, CustomLLMFamilyV2):
-            if llm_family.model_family not in
+            if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
         else:
-            if llm_family.model_name not in
+            if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
-        if
+        if (
+            "vision" not in llm_family.model_ability
+            and "audio" not in llm_family.model_ability
+            and "omni" not in llm_family.model_ability
+        ):
             return False
         return VLLM_INSTALLED
 
@@ -1470,13 +1507,21 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     ) -> VLLMModelConfig:
         model_config = super()._sanitize_model_config(model_config)
         if VLLM_VERSION >= version.parse("0.5.5"):
-            model_config
-
-
-
-
-
+            if model_config.get("limit_mm_per_prompt"):
+                model_config["limit_mm_per_prompt"] = json.loads(
+                    model_config.get("limit_mm_per_prompt")  # type: ignore
+                )
+            else:
+                if "omni" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {
+                        "image": 2,
+                        "video": 2,
+                        "audio": 2,
+                    }
+                elif "vision" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2}
+                elif "audio" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"audio": 2}
         return model_config
 
     def _sanitize_chat_config(
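In other words, an explicit `limit_mm_per_prompt` value is parsed as a JSON string, and otherwise a default is derived from the model's abilities. A runnable sketch of that selection logic (a simplified stand-in for the `_sanitize_model_config` branch above, not the method itself):

```python
# Simplified stand-in; the ability strings ("omni", "vision", "audio")
# mirror the ones used in the diff.
import json
from typing import Any, Dict, List


def default_mm_limits(model_config: Dict[str, Any], abilities: List[str]) -> Dict[str, Any]:
    if model_config.get("limit_mm_per_prompt"):
        # A user-supplied value arrives as a JSON string, e.g. '{"image": 4}'.
        model_config["limit_mm_per_prompt"] = json.loads(
            model_config["limit_mm_per_prompt"]
        )
    elif "omni" in abilities:
        model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2, "audio": 2}
    elif "vision" in abilities:
        model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2}
    elif "audio" in abilities:
        model_config["limit_mm_per_prompt"] = {"audio": 2}
    return model_config


print(default_mm_limits({}, ["chat", "vision"]))
print(default_mm_limits({"limit_mm_per_prompt": '{"image": 4}'}, ["chat", "vision"]))
```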
@@ -1510,7 +1555,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             multi_modal_data = prompt.get("multi_modal_data")
 
             token_ids = await asyncio.to_thread(
-                self._tokenize,
+                self._tokenize,
+                tokenizer,
+                prompt_str,
+                config,  # type: ignore
             )
             return TokensPrompt(
                 prompt_token_ids=token_ids, multi_modal_data=multi_modal_data
@@ -1526,9 +1574,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
 
         model_family = self.model_family.model_family or self.model_family.model_name
-
+        audios, images, videos = None, None, None
         if "internvl" not in model_family.lower():
-            from
+            from qwen_omni_utils import (
+                process_audio_info,
+                process_mm_info,
+                process_vision_info,
+            )
 
             messages = self._transform_messages(messages)
 
@@ -1543,29 +1595,36 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             if tools and model_family in QWEN_TOOL_CALL_FAMILY:
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None
+            if "omni" in self.model_family.model_ability:
+                audios, images, videos = process_mm_info(
+                    messages, use_audio_in_video=True
+                )
+            elif "audio" in self.model_family.model_ability:
+                audios = process_audio_info(messages, use_audio_in_video=False)
+            elif "vision" in self.model_family.model_ability:
+                images, videos = process_vision_info(  # type: ignore
+                    messages, return_video_kwargs=False
+                )
+
 
             prompt = self.get_full_context(
                 messages, self.model_family.chat_template, **full_context_kwargs
             )
-            images, video_inputs = process_vision_info(messages)
-            if video_inputs:
-                raise ValueError("Not support video input now.")
-        else:
-            prompt, images = self.get_specific_prompt(model_family, messages)
 
-        if not images:
-            inputs = {
-                "prompt": prompt,
-            }
-        elif len(images) == 1:
-            inputs = {
-                "prompt": prompt,
-                "multi_modal_data": {"image": images[-1]},  # type: ignore
-            }
         else:
-
-
-
-
+            prompt, images = self.get_specific_prompt(model_family, messages)
+        inputs = {"prompt": prompt, "multi_modal_data": {}, "mm_processor_kwargs": {}}
+        if images:
+            inputs["multi_modal_data"]["image"] = images
+        if videos:
+            inputs["multi_modal_data"]["video"] = videos
+        if audios:
+            inputs["multi_modal_data"]["audio"] = audios
+        if "omni" in self.model_family.model_ability:
+            inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
+        if inputs["multi_modal_data"] == {}:
+            inputs.pop("multi_modal_data")
+        if inputs["mm_processor_kwargs"] == {}:
+            inputs.pop("mm_processor_kwargs")
         generate_config = self._sanitize_chat_config(generate_config)
 
         stream = generate_config.get("stream", None)
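The rebuilt chat path now always starts from a full inputs dict and prunes what is unused. A standalone sketch of that assembly step; `build_inputs` is a hypothetical helper and the placeholder values stand in for the outputs of the chat template and the qwen_omni_utils processors:

```python
# Hypothetical helper illustrating the inputs-dict assembly in the hunk above.
from typing import Any, Dict


def build_inputs(prompt: str, images=None, videos=None, audios=None, omni=False) -> Dict[str, Any]:
    inputs: Dict[str, Any] = {
        "prompt": prompt,
        "multi_modal_data": {},
        "mm_processor_kwargs": {},
    }
    if images:
        inputs["multi_modal_data"]["image"] = images
    if videos:
        inputs["multi_modal_data"]["video"] = videos
    if audios:
        inputs["multi_modal_data"]["audio"] = audios
    if omni:
        inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
    # Drop empty containers before handing the dict to the engine.
    if not inputs["multi_modal_data"]:
        inputs.pop("multi_modal_data")
    if not inputs["mm_processor_kwargs"]:
        inputs.pop("mm_processor_kwargs")
    return inputs


print(build_inputs("Hello"))                          # text-only: {'prompt': 'Hello'}
print(build_inputs("Describe this", images=["<img>"]))  # includes multi_modal_data
```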
xinference/model/utils.py
CHANGED
@@ -315,6 +315,11 @@ def set_all_random_seed(seed: int):
 
 
 class CancellableDownloader:
+    _global_lock = threading.Lock()
+    _active_instances = 0
+    _original_update = None  # Class-level original update method
+    _patch_lock = threading.Lock()  # Additional lock for patching operations
+
     def __init__(
         self,
         cancel_error_cls: Type[BaseException] = asyncio.CancelledError,
@@ -325,23 +330,23 @@ class CancellableDownloader:
         self._cancelled = threading.Event()
         self._done_event = threading.Event()
         self._cancel_error_cls = cancel_error_cls
-        self._original_update = None
         # progress for tqdm that is main
         self._main_progresses: Set[tqdm] = set()
         # progress for file downloader
         # mainly when tqdm unit is set
         self._download_progresses: Set[tqdm] = set()
-        # tqdm
-        self.
+        # Instance-specific tqdm tracking
+        self._patched_instances: Set[int] = set()
 
     def reset(self):
         self._main_progresses.clear()
         self._download_progresses.clear()
 
     def get_progress(self) -> float:
-        if self.
-            # directly return 1.0 when
+        if self.done:
+            # directly return 1.0 when finished
             return 1.0
+        # Don't return 1.0 when cancelled, calculate actual progress
 
         tasks = finished_tasks = 0
         for main_progress in self._main_progresses:
@@ -376,6 +381,7 @@ class CancellableDownloader:
 
     def cancel(self):
         self._cancelled.set()
+        self._done_event.set()
 
     @property
     def cancelled(self):
@@ -392,39 +398,76 @@ class CancellableDownloader:
             raise self._cancel_error_cls(error_msg)
 
     def patch_tqdm(self):
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Use class-level patching to avoid conflicts
+        with self._patch_lock:
+            if self._original_update is None:
+                self._original_update = original_update = tqdm.update
+
+                # Thread-safe patched update
+                def patched_update(tqdm_instance, n):
+                    import gc
+
+                    # Get all CancellableDownloader instances and check for cancellation
+                    downloaders = [
+                        obj
+                        for obj in gc.get_objects()
+                        if isinstance(obj, CancellableDownloader)
+                    ]
+
+                    for downloader in downloaders:
+                        # if download cancelled, throw error
+                        if getattr(downloader, "cancelled", False):
+                            downloader.raise_error()
+
+                        progresses = None
+                        if not getattr(tqdm_instance, "disable", False):
+                            unit = getattr(tqdm_instance, "unit", "it")
+                            if unit == "it":
+                                progresses = getattr(
+                                    downloader, "_main_progresses", None
+                                )
+                            else:
+                                progresses = getattr(
+                                    downloader, "_download_progresses", None
+                                )
+
+                            if progresses is not None:
+                                progresses.add(tqdm_instance)
+                            else:
+                                logger.debug(
+                                    f"No progresses found for downloader {downloader}"
+                                )
+
+                    # Call original update with safety check
+                    return original_update(tqdm_instance, n)
+
+                tqdm.update = patched_update
 
     def unpatch_tqdm(self):
-
-
-
-
+        with self._patch_lock:
+            if self._original_update is not None and self._active_instances == 0:
+                tqdm.update = self._original_update
+                self._original_update = None
 
     def __enter__(self):
-
+        # Use global lock to prevent concurrent patching
+        with self._global_lock:
+            if self._active_instances == 0:
+                self.patch_tqdm()
+            self._active_instances += 1
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-
-        self.
-
+        # Use global lock to prevent concurrent unpatching
+        with self._global_lock:
+            self._active_instances -= 1
+            if self._active_instances == 0:
+                self.unpatch_tqdm()
+        try:
+            self._done_event.set()
+            self.reset()
+        except Exception as e:
+            logger.debug(f"Error during CancellableDownloader cleanup: {e}")
 
 
 def get_engine_params_by_name(
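For readers unfamiliar with the approach, the hunk above amounts to reference-counted monkey-patching: the first active downloader installs a patched `tqdm.update`, and the last one to exit restores the original. A compact, self-contained sketch of that pattern (an illustration, not the `CancellableDownloader` implementation itself):

```python
# Minimal sketch of reference-counted tqdm patching under a class-level lock.
import threading

from tqdm import tqdm


class PatchedTqdmScope:
    _lock = threading.Lock()
    _active = 0
    _original_update = None

    def __enter__(self):
        with PatchedTqdmScope._lock:
            if PatchedTqdmScope._active == 0:
                original = PatchedTqdmScope._original_update = tqdm.update

                def patched_update(instance, n=1):
                    # Hook point: check for cancellation, record progress, ...
                    return original(instance, n)

                tqdm.update = patched_update
            PatchedTqdmScope._active += 1
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        with PatchedTqdmScope._lock:
            PatchedTqdmScope._active -= 1
            if PatchedTqdmScope._active == 0 and PatchedTqdmScope._original_update:
                tqdm.update = PatchedTqdmScope._original_update
                PatchedTqdmScope._original_update = None


with PatchedTqdmScope():
    for _ in tqdm(range(3)):
        pass
```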
|