xinference 1.10.1__py3-none-any.whl → 1.11.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.
Files changed (39)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +462 -3
  3. xinference/client/restful/async_restful_client.py +158 -5
  4. xinference/client/restful/restful_client.py +131 -0
  5. xinference/core/supervisor.py +12 -0
  6. xinference/model/audio/model_spec.json +20 -20
  7. xinference/model/image/model_spec.json +159 -159
  8. xinference/model/llm/__init__.py +2 -2
  9. xinference/model/llm/llm_family.json +843 -180
  10. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  11. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  12. xinference/model/llm/sglang/core.py +20 -6
  13. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  14. xinference/model/llm/transformers/chatglm.py +3 -0
  15. xinference/model/llm/transformers/core.py +93 -16
  16. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  17. xinference/model/llm/transformers/utils.py +3 -0
  18. xinference/model/llm/utils.py +37 -24
  19. xinference/model/llm/vllm/core.py +128 -69
  20. xinference/model/utils.py +74 -31
  21. xinference/thirdparty/audiotools/core/audio_signal.py +6 -6
  22. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  23. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  24. xinference/types.py +9 -0
  25. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  26. xinference/ui/web/ui/build/index.html +1 -1
  27. xinference/ui/web/ui/build/static/js/{main.d192c4f3.js → main.e4d9a9e1.js} +3 -3
  28. xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.map → main.e4d9a9e1.js.map} +1 -1
  29. xinference/ui/web/ui/node_modules/.cache/babel-loader/e6770a05771952175c9fbf48fce283c9bb1bc8b5763e39edc36d099d1fe16b4a.json +1 -0
  30. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  31. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/METADATA +8 -5
  32. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/RECORD +37 -36
  33. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +0 -1
  34. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +0 -1
  35. /xinference/ui/web/ui/build/static/js/{main.d192c4f3.js.LICENSE.txt → main.e4d9a9e1.js.LICENSE.txt} +0 -0
  36. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/WHEEL +0 -0
  37. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/entry_points.txt +0 -0
  38. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/licenses/LICENSE +0 -0
  39. {xinference-1.10.1.dist-info → xinference-1.11.0.post1.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py CHANGED
@@ -75,6 +75,8 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-VL-Thinking",
     "Qwen3-Next-Instruct",
     "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -100,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -143,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -186,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
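The corrected message above documents the contract for `chat_template_kwargs`: it may be passed either as a dict or as a JSON object string. A minimal sketch of that normalization, using an illustrative helper name that is not part of the diff:

import json
from typing import Union


def normalize_chat_template_kwargs(kwargs: Union[str, dict]) -> dict:
    # A dict is accepted as-is; a string must be a JSON object such as
    # '{"enable_thinking": false}'. Anything else trips the TypeError above.
    if isinstance(kwargs, dict):
        return kwargs
    if isinstance(kwargs, str):
        try:
            parsed = json.loads(kwargs)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    raise TypeError(
        f"`chat_template_kwargs` must be a JSON parsable str or dict, got: {kwargs}"
    )


assert normalize_chat_template_kwargs('{"enable_thinking": false}') == {
    "enable_thinking": False
}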
@@ -853,11 +853,11 @@ class ChatModelMixin:
             "tool_calls": tool_calls,
         }
 
-        try:
-            usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
             usage = None
+        else:
+            usage = c.get("usage")
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -882,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-        if self.reasoning_parser:
-            c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-            text = c["choices"][0]["text"]
-            reasoning_content, content = (
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-            c["choices"][0]["text"] = content
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
        failed_contents = []
        if isinstance(self.tool_parser, Glm4ToolParser):
            tool_result = self.tool_parser.extract_tool_calls(c)
        else:
-            text = c["choices"][0]["text"]
            tool_result = self.tool_parser.extract_tool_calls(text)
-        for content, func, args in tool_result:
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
            if func:
                tool_calls.append(
                    {
@@ -913,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if content:
-                    failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-        content = "".join(failed_contents) if failed_contents else None
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content if content else "",
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-        try:
-            usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
xinference/model/llm/vllm/core.py CHANGED
@@ -131,7 +131,7 @@ except ImportError:
     VLLM_INSTALLED = False
     VLLM_VERSION = None
 
-VLLM_SUPPORTED_VISION_MODEL_LIST: List[str] = []
+VLLM_SUPPORTED_MULTI_MODEL_LIST: List[str] = []
 VLLM_SUPPORTED_MODELS = [
     "llama-2",
     "llama-3",
@@ -229,34 +229,37 @@ if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.5.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("HuatuoGPT-o1-LLaMA-3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.1"):
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("internvl2")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL2.5-MPO")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("InternVL3")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("internvl2")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL2.5-MPO")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("InternVL3")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("minicpm3-4b")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.6.3"):
     VLLM_SUPPORTED_MODELS.append("llama-3.2-vision")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("llama-3.2-vision-instruct")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2-vl-instruct")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("QvQ-72B-Preview")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("llama-3.2-vision-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-vl-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("QvQ-72B-Preview")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("internlm3-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.2"):
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("qwen2.5-vl-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-vl-instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("moonlight-16b-a3b-instruct")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2-audio-instruct")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.7.3"):
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen2.5-instruct-1m")
     VLLM_SUPPORTED_CHAT_MODELS.append("qwenLong-l1")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("qwen2.5-omni")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("gemma-3-it")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("gemma-3-it")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.8.4"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")
@@ -272,7 +275,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.1"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Ernie4.5")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.1v-thinking")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.1v-thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
@@ -280,7 +283,7 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
-    VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+    VLLM_SUPPORTED_MULTI_MODEL_LIST.append("glm-4.5v")
     VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
@@ -291,9 +294,11 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Next-Thinking")
 
-    if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.2"):
-        VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
-        VLLM_SUPPORTED_VISION_MODEL_LIST.append("Qwen3-VL-Instruct")
+    if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.11.0"):
+        VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Thinking")
+        VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-VL-Instruct")
+        VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Thinking")
+        VLLM_SUPPORTED_MULTI_MODEL_LIST.append("Qwen3-Omni-Instruct")
 
 
 class VLLMModel(LLM):
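The renamed VLLM_SUPPORTED_MULTI_MODEL_LIST is still built the same way: each block only registers model families when the installed vLLM is new enough. A standalone sketch of that gating pattern under an assumed installed version (the variable names here are illustrative, not the module's):

from packaging import version

installed = version.parse("0.11.0")  # pretend this is the vLLM version detected at import time
supported_multimodal = []

# Mirror the incremental registration above: a newer vLLM unlocks more families.
if installed >= version.parse("0.7.2"):
    supported_multimodal += ["qwen2.5-vl-instruct", "qwen2-audio-instruct"]
if installed >= version.parse("0.10.0"):
    supported_multimodal.append("glm-4.5v")
if installed >= version.parse("0.11.0"):
    supported_multimodal += ["Qwen3-VL-Instruct", "Qwen3-Omni-Instruct"]

print("Qwen3-Omni-Instruct" in supported_multimodal)  # True only on vLLM >= 0.11.0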
@@ -545,7 +550,7 @@ class VLLMModel(LLM):
             # patch vllm Executor.get_class
             Executor.get_class = lambda vllm_config: executor_cls
             self._engine = AsyncLLMEngine.from_engine_args(engine_args)
-        except:
+        except:  # noqa: E722
             logger.exception("Creating vllm engine failed")
             self._loading_error = sys.exc_info()
 
@@ -714,7 +719,7 @@ class VLLMModel(LLM):
             logger.info("Detecting vLLM is not health, prepare to quit the process")
             try:
                 self.stop()
-            except:
+            except:  # noqa: E722
                 # ignore error when stop
                 pass
             # Just kill the process and let xinference auto-recover the model
@@ -857,7 +862,7 @@ class VLLMModel(LLM):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and not (quantization is None):
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
@@ -988,7 +993,10 @@ class VLLMModel(LLM):
         from vllm import TokensPrompt
 
         token_ids = await asyncio.to_thread(
-            self._tokenize, tokenizer, prompt, config  # type: ignore
+            self._tokenize,
+            tokenizer,
+            prompt,  # type: ignore
+            config,
         )
         return TokensPrompt(prompt_token_ids=token_ids)
 
@@ -1082,18 +1090,43 @@ class VLLMModel(LLM):
                 logger.warning(f"Failed to create GuidedDecodingParams: {e}")
                 guided_options = None
 
-            # Use structured_outputs for vLLM >= 0.11.0, guided_decoding for older versions
-            if (
-                VLLM_VERSION >= version.parse("0.11.0")
-                or VLLM_VERSION.base_version >= "0.11.0"
-            ):
-                sampling_params = SamplingParams(
-                    structured_outputs=guided_options, **sanitized_generate_config
-                )
-            else:
-                sampling_params = SamplingParams(
-                    guided_decoding=guided_options, **sanitized_generate_config
+            try:
+                import inspect
+
+                sp_sig = inspect.signature(SamplingParams)
+                # For v0.9.2 and similar versions, prioritize guided_decoding over structured_outputs
+                # structured_outputs was introduced later (around v0.11.0) and may not accept
+                # GuidedDecodingParams in earlier versions even if the parameter exists
+                if "guided_decoding" in sp_sig.parameters:
+                    sampling_params = SamplingParams(
+                        guided_decoding=guided_options, **sanitized_generate_config
+                    )
+                elif "structured_outputs" in sp_sig.parameters:
+                    try:
+                        sampling_params = SamplingParams(
+                            structured_outputs=guided_options,
+                            **sanitized_generate_config,
+                        )
+                    except TypeError as e:
+                        if "structured_outputs" in str(e):
+                            # structured_outputs parameter exists but doesn't accept GuidedDecodingParams
+                            # Fall back to no guided decoding
+                            logger.warning(
+                                f"structured_outputs parameter failed: {e}. "
+                                "Falling back to no guided decoding for vLLM version compatibility."
+                            )
+                            sampling_params = SamplingParams(
+                                **sanitized_generate_config
+                            )
+                        else:
+                            raise
+                else:
+                    sampling_params = SamplingParams(**sanitized_generate_config)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to create SamplingParams with guided decoding: {e}"
                 )
+                sampling_params = SamplingParams(**sanitized_generate_config)
         else:
             # ignore generate configs for older versions
             sanitized_generate_config.pop("guided_json", None)
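Instead of comparing version strings, the replacement code above probes the SamplingParams constructor with inspect.signature and routes the guided-decoding options to whichever keyword actually exists, falling back to plain sampling. A self-contained sketch of that feature-detection pattern with stand-in classes (no vLLM import; OldSamplingParams and NewSamplingParams are illustrative):

import inspect
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class OldSamplingParams:
    # Stand-in for a vLLM release that accepts `guided_decoding`.
    max_tokens: int = 16
    guided_decoding: Optional[Any] = None


@dataclass
class NewSamplingParams:
    # Stand-in for a release that renamed the keyword to `structured_outputs`.
    max_tokens: int = 16
    structured_outputs: Optional[Any] = None


def build_sampling_params(cls, guided_options, **config):
    # Inspect the constructor once, then pass the options to whichever
    # parameter is present; otherwise drop guided decoding entirely.
    params = inspect.signature(cls).parameters
    if "guided_decoding" in params:
        return cls(guided_decoding=guided_options, **config)
    if "structured_outputs" in params:
        return cls(structured_outputs=guided_options, **config)
    return cls(**config)


old = build_sampling_params(OldSamplingParams, {"json": {}}, max_tokens=32)
new = build_sampling_params(NewSamplingParams, {"json": {}}, max_tokens=32)
print(old.guided_decoding, new.structured_outputs)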
@@ -1111,7 +1144,9 @@ class VLLMModel(LLM):
             # this requires tokenizing
             tokenizer = await self._get_tokenizer(lora_request)
             prompt_or_token_ids = await self._gen_tokens_prompt(
-                tokenizer, prompt, sanitized_generate_config  # type: ignore
+                tokenizer,
+                prompt,
+                sanitized_generate_config,  # type: ignore
             )
             sampling_params.max_tokens = max_tokens = self._context_length - len(  # type: ignore
                 prompt_or_token_ids["prompt_token_ids"]  # type: ignore
@@ -1266,11 +1301,10 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         ]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and not (quantization is None):
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
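The AWQ check above is relaxed from 4-bit only to anything whose quantization name mentions 4 or 8 bits; a one-function illustration of the new predicate:

def awq_quantization_accepted(quantization: str) -> bool:
    # Same predicate as the diff: accept 4-bit and 8-bit AWQ variants.
    return any(q in quantization for q in ("4", "8"))


for name in ("Int4", "Int8", "W4A16", "Int3"):
    print(name, awq_quantization_accepted(name))  # Int3 is the only False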
@@ -1430,7 +1464,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         return self._to_chat_completion(c, self.reasoning_parser)
 
 
-class VLLMVisionModel(VLLMModel, ChatModelMixin):
+class VLLMMultiModel(VLLMModel, ChatModelMixin):
     @classmethod
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
@@ -1442,11 +1476,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8", "bnb"]:
             return False
         if llm_spec.model_format == "pytorch":
-            if quantization != "none" and not (quantization is None):
+            if quantization != "none" and quantization is not None:
                 return False
         if llm_spec.model_format == "awq":
-            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
-            if "4" not in quantization:
+            if not any(q in quantization for q in ("4", "8")):
                 return False
         if llm_spec.model_format == "gptq":
             if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.3.3"):
@@ -1456,12 +1489,16 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
                 if "4" not in quantization:
                     return False
         if isinstance(llm_family, CustomLLMFamilyV2):
-            if llm_family.model_family not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+            if llm_family.model_family not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
         else:
-            if llm_family.model_name not in VLLM_SUPPORTED_VISION_MODEL_LIST:
+            if llm_family.model_name not in VLLM_SUPPORTED_MULTI_MODEL_LIST:
                 return False
-        if "vision" not in llm_family.model_ability:
+        if (
+            "vision" not in llm_family.model_ability
+            and "audio" not in llm_family.model_ability
+            and "omni" not in llm_family.model_ability
+        ):
             return False
         return VLLM_INSTALLED
 
@@ -1470,13 +1507,21 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
     ) -> VLLMModelConfig:
         model_config = super()._sanitize_model_config(model_config)
         if VLLM_VERSION >= version.parse("0.5.5"):
-            model_config["limit_mm_per_prompt"] = (
-                json.loads(model_config.get("limit_mm_per_prompt"))  # type: ignore
-                if model_config.get("limit_mm_per_prompt")
-                else {
-                    "image": 2,  # default 2 images all chat
-                }
-            )
+            if model_config.get("limit_mm_per_prompt"):
+                model_config["limit_mm_per_prompt"] = json.loads(
+                    model_config.get("limit_mm_per_prompt")  # type: ignore
+                )
+            else:
+                if "omni" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {
+                        "image": 2,
+                        "video": 2,
+                        "audio": 2,
+                    }
+                elif "vision" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"image": 2, "video": 2}
+                elif "audio" in self.model_family.model_ability:
+                    model_config["limit_mm_per_prompt"] = {"audio": 2}
         return model_config
 
     def _sanitize_chat_config(
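In the rewritten _sanitize_model_config, a user-supplied limit_mm_per_prompt is a JSON object string, and when it is absent the default limits are chosen from the model's abilities. A minimal sketch of that resolution outside the class (the function name is illustrative, not from the diff):

import json
from typing import List, Optional


def resolve_limit_mm_per_prompt(raw: Optional[str], model_ability: List[str]) -> dict:
    # User-supplied values arrive as JSON strings, e.g. '{"image": 4, "video": 1}'.
    if raw:
        return json.loads(raw)
    # Otherwise pick defaults by ability, mirroring the branches above.
    if "omni" in model_ability:
        return {"image": 2, "video": 2, "audio": 2}
    if "vision" in model_ability:
        return {"image": 2, "video": 2}
    if "audio" in model_ability:
        return {"audio": 2}
    return {}


print(resolve_limit_mm_per_prompt(None, ["chat", "vision"]))            # {'image': 2, 'video': 2}
print(resolve_limit_mm_per_prompt('{"image": 4}', ["chat", "vision"]))  # {'image': 4}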
@@ -1510,7 +1555,10 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             multi_modal_data = prompt.get("multi_modal_data")
 
             token_ids = await asyncio.to_thread(
-                self._tokenize, tokenizer, prompt_str, config  # type: ignore
+                self._tokenize,
+                tokenizer,
+                prompt_str,
+                config,  # type: ignore
             )
             return TokensPrompt(
                 prompt_token_ids=token_ids, multi_modal_data=multi_modal_data
@@ -1526,9 +1574,13 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
         tools = generate_config.pop("tools", []) if generate_config else None
 
         model_family = self.model_family.model_family or self.model_family.model_name
-
+        audios, images, videos = None, None, None
         if "internvl" not in model_family.lower():
-            from qwen_vl_utils import process_vision_info
+            from qwen_omni_utils import (
+                process_audio_info,
+                process_mm_info,
+                process_vision_info,
+            )
 
             messages = self._transform_messages(messages)
 
@@ -1543,29 +1595,36 @@ class VLLMVisionModel(VLLMModel, ChatModelMixin):
             if tools and model_family in QWEN_TOOL_CALL_FAMILY:
                 full_context_kwargs["tools"] = tools
             assert self.model_family.chat_template is not None
+            if "omni" in self.model_family.model_ability:
+                audios, images, videos = process_mm_info(
+                    messages, use_audio_in_video=True
+                )
+            elif "audio" in self.model_family.model_ability:
+                audios = process_audio_info(messages, use_audio_in_video=False)
+            elif "vision" in self.model_family.model_ability:
+                images, videos = process_vision_info(  # type: ignore
+                    messages, return_video_kwargs=False
+                )
+
             prompt = self.get_full_context(
                 messages, self.model_family.chat_template, **full_context_kwargs
             )
-            images, video_inputs = process_vision_info(messages)
-            if video_inputs:
-                raise ValueError("Not support video input now.")
-        else:
-            prompt, images = self.get_specific_prompt(model_family, messages)
 
-        if not images:
-            inputs = {
-                "prompt": prompt,
-            }
-        elif len(images) == 1:
-            inputs = {
-                "prompt": prompt,
-                "multi_modal_data": {"image": images[-1]},  # type: ignore
-            }
         else:
-            inputs = {
-                "prompt": prompt,
-                "multi_modal_data": {"image": images},  # type: ignore
-            }
+            prompt, images = self.get_specific_prompt(model_family, messages)
+        inputs = {"prompt": prompt, "multi_modal_data": {}, "mm_processor_kwargs": {}}
+        if images:
+            inputs["multi_modal_data"]["image"] = images
+        if videos:
+            inputs["multi_modal_data"]["video"] = videos
+        if audios:
+            inputs["multi_modal_data"]["audio"] = audios
+        if "omni" in self.model_family.model_ability:
+            inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
+        if inputs["multi_modal_data"] == {}:
+            inputs.pop("multi_modal_data")
+        if inputs["mm_processor_kwargs"] == {}:
+            inputs.pop("mm_processor_kwargs")
         generate_config = self._sanitize_chat_config(generate_config)
 
         stream = generate_config.get("stream", None)
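After this rewrite the prompt handed to vLLM is always assembled as one dict, and the optional multi_modal_data / mm_processor_kwargs keys are stripped when empty. A small sketch of the resulting shapes with placeholder media values (the strings stand in for decoded images and audio arrays; the helper name is illustrative):

def build_vllm_inputs(prompt, images=None, videos=None, audios=None, omni=False):
    # Same assembly as the diff: start with empty containers, drop them if unused.
    inputs = {"prompt": prompt, "multi_modal_data": {}, "mm_processor_kwargs": {}}
    if images:
        inputs["multi_modal_data"]["image"] = images
    if videos:
        inputs["multi_modal_data"]["video"] = videos
    if audios:
        inputs["multi_modal_data"]["audio"] = audios
    if omni:
        inputs["mm_processor_kwargs"]["use_audio_in_video"] = True
    if not inputs["multi_modal_data"]:
        inputs.pop("multi_modal_data")
    if not inputs["mm_processor_kwargs"]:
        inputs.pop("mm_processor_kwargs")
    return inputs


print(build_vllm_inputs("Describe the image."))  # text-only: both optional keys removed
print(build_vllm_inputs("<chat prompt>", images=["<image>"], audios=["<audio>"], omni=True))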
xinference/model/utils.py CHANGED
@@ -315,6 +315,11 @@ def set_all_random_seed(seed: int):
 
 
 class CancellableDownloader:
+    _global_lock = threading.Lock()
+    _active_instances = 0
+    _original_update = None  # Class-level original update method
+    _patch_lock = threading.Lock()  # Additional lock for patching operations
+
     def __init__(
         self,
         cancel_error_cls: Type[BaseException] = asyncio.CancelledError,
@@ -325,23 +330,23 @@ class CancellableDownloader:
         self._cancelled = threading.Event()
         self._done_event = threading.Event()
         self._cancel_error_cls = cancel_error_cls
-        self._original_update = None
         # progress for tqdm that is main
         self._main_progresses: Set[tqdm] = set()
         # progress for file downloader
         # mainly when tqdm unit is set
         self._download_progresses: Set[tqdm] = set()
-        # tqdm original update
-        self._original_tqdm_update = None
+        # Instance-specific tqdm tracking
+        self._patched_instances: Set[int] = set()
 
     def reset(self):
         self._main_progresses.clear()
         self._download_progresses.clear()
 
     def get_progress(self) -> float:
-        if self.cancelled or self.done:
-            # directly return 1.0 when cancelled or finished
+        if self.done:
+            # directly return 1.0 when finished
             return 1.0
+        # Don't return 1.0 when cancelled, calculate actual progress
 
         tasks = finished_tasks = 0
         for main_progress in self._main_progresses:
@@ -376,6 +381,7 @@
 
     def cancel(self):
         self._cancelled.set()
+        self._done_event.set()
 
     @property
     def cancelled(self):
@@ -392,39 +398,76 @@
             raise self._cancel_error_cls(error_msg)
 
     def patch_tqdm(self):
-        # patch tqdm
-        # raise error if cancelled
-        self._original_update = original_update = tqdm.update
-        downloader = self
-
-        def patched_update(self, n):
-            if downloader.cancelled:
-                downloader.raise_error()
-            if not self.disable:
-                progresses = (
-                    downloader._main_progresses
-                    if getattr(self, "unit", "it") == "it"
-                    else downloader._download_progresses
-                )
-                progresses.add(self)
-            return original_update(self, n)
-
-        tqdm.update = patched_update
+        # Use class-level patching to avoid conflicts
+        with self._patch_lock:
+            if self._original_update is None:
+                self._original_update = original_update = tqdm.update
+
+                # Thread-safe patched update
+                def patched_update(tqdm_instance, n):
+                    import gc
+
+                    # Get all CancellableDownloader instances and check for cancellation
+                    downloaders = [
+                        obj
+                        for obj in gc.get_objects()
+                        if isinstance(obj, CancellableDownloader)
+                    ]
+
+                    for downloader in downloaders:
+                        # if download cancelled, throw error
+                        if getattr(downloader, "cancelled", False):
+                            downloader.raise_error()
+
+                        progresses = None
+                        if not getattr(tqdm_instance, "disable", False):
+                            unit = getattr(tqdm_instance, "unit", "it")
+                            if unit == "it":
+                                progresses = getattr(
+                                    downloader, "_main_progresses", None
+                                )
+                            else:
+                                progresses = getattr(
+                                    downloader, "_download_progresses", None
+                                )
+
+                            if progresses is not None:
+                                progresses.add(tqdm_instance)
+                            else:
+                                logger.debug(
+                                    f"No progresses found for downloader {downloader}"
+                                )
+
+                    # Call original update with safety check
+                    return original_update(tqdm_instance, n)
+
+                tqdm.update = patched_update
 
     def unpatch_tqdm(self):
-        from tqdm.auto import tqdm
-
-        if self._original_update:
-            tqdm.update = self._original_update
+        with self._patch_lock:
+            if self._original_update is not None and self._active_instances == 0:
+                tqdm.update = self._original_update
+                self._original_update = None
 
     def __enter__(self):
-        self.patch_tqdm()
+        # Use global lock to prevent concurrent patching
+        with self._global_lock:
+            if self._active_instances == 0:
+                self.patch_tqdm()
+            self._active_instances += 1
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.unpatch_tqdm()
-        self._done_event.set()
-        self.reset()
+        # Use global lock to prevent concurrent unpatching
+        with self._global_lock:
+            self._active_instances -= 1
+            if self._active_instances == 0:
+                self.unpatch_tqdm()
+        try:
+            self._done_event.set()
+            self.reset()
+        except Exception as e:
+            logger.debug(f"Error during CancellableDownloader cleanup: {e}")
 
 
 def get_engine_params_by_name(
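The reworked downloader patches tqdm.update once for the whole process and keeps a reference count so the original method is restored only when the last active downloader exits. A condensed, standalone sketch of that refcounted monkeypatch pattern (PatchedTqdm is illustrative, not the xinference class, and it omits the cancellation and progress-tracking logic above):

import threading

from tqdm import tqdm


class PatchedTqdm:
    """Context manager that swaps tqdm.update once, across nested or concurrent uses."""

    _lock = threading.Lock()
    _active = 0
    _original_update = None

    def __enter__(self):
        cls = type(self)
        with cls._lock:
            if cls._active == 0:
                cls._original_update = tqdm.update

                def patched(instance, n=1):
                    # Cancellation checks and progress bookkeeping would go here.
                    return cls._original_update(instance, n)

                tqdm.update = patched
            cls._active += 1
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        cls = type(self)
        with cls._lock:
            cls._active -= 1
            if cls._active == 0 and cls._original_update is not None:
                tqdm.update = cls._original_update
                cls._original_update = None


# Nested use patches once and restores only when the outermost context exits.
with PatchedTqdm():
    with PatchedTqdm():
        for _ in tqdm(range(3)):
            pass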