xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py CHANGED
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser
 
 logger = logging.getLogger(__name__)
 
@@ -82,7 +83,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528", "Deepseek-V3.1"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
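Note: the mixin now stores a per-model tool_parser (wired up by the new xinference/model/llm/tool_parsers package listed above) and delegates extraction to it. A minimal sketch of the interface implied by the calls in this diff; the class name, type alias, and docstrings here are assumptions, not the actual contents of abstract_tool_parser.py:

from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

# Hypothetical contract inferred from the extract_tool_calls / extract_tool_calls_streaming
# calls visible in the hunks below; the real abstract_tool_parser.py may differ.
ToolCallResult = Tuple[Optional[str], Optional[str], Optional[dict]]  # (raw content, name, arguments)

class ToolParser(ABC):
    @abstractmethod
    def extract_tool_calls(self, model_output: str) -> List[ToolCallResult]:
        """Parse the full generated text once generation has finished."""

    @abstractmethod
    def extract_tool_calls_streaming(
        self, previous_texts: List[str], current_text: str, delta_text: str
    ) -> Optional[ToolCallResult]:
        """Parse incrementally; return None while no complete call is available."""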
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-                results.append((None, res["name"], res["arguments"]))
-            except Exception as e:
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+            except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results
 
     @classmethod
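The stricter parsing above only treats a JSON object carrying both "name" and "arguments" as a tool call; anything else is logged and kept as plain content. A standalone illustration of the two cases:

import json

# Accepted: a dict with both required fields.
ok = '{"name": "get_weather", "arguments": {"city": "Paris"}}'
res = json.loads(ok, strict=False)
assert isinstance(res, dict) and "name" in res and "arguments" in res

# Kept as plain content: valid JSON, but not a dict with both fields.
bad = '["get_weather", {"city": "Paris"}]'
res = json.loads(bad, strict=False)
assert not isinstance(res, dict)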
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result
 
-    @classmethod
     def _post_process_completion_chunk(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-        reasoning_parser: Optional[ReasoningParser] = None,
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-        tool_result = cls._eval_tool_arguments(model_family, c, tool_call_text)
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+            if tool_result is None and not finish_reason:
+                return None
         tool_calls = []
         failed_contents = []
-        for content, func, args in tool_result:
-            if func:
-                tool_calls.append(
-                    {
-                        "index": 0,
-                        "id": f"call_{_id}",
-                        "type": "function",
-                        "function": {
-                            "name": func,
-                            "arguments": json.dumps(args, ensure_ascii=False),
-                        },
-                    }
-                )
-            else:
-                failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)
 
-        content = "".join(failed_contents) if failed_contents else None
+        finish_reason = "tool_calls" if tool_calls else finish_reason
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None
 
         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
 
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }
 
-    @classmethod
     def _post_process_completion(
-        cls,
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if reasoning_parser:
-            c = reasoning_parser.prepare_reasoning_content(c)
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content = reasoning_parser.extract_reasoning_content(
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content
 
-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@ class ChatModelMixin:
 
         content = "".join(failed_contents) if failed_contents else None
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +987,44 @@ class ChatModelMixin:
 
         return transformed_messages
 
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+
 
 def get_model_version(
     model_name: str,
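For reference, the previous_texts bookkeeping that _async_to_tool_completion_chunks shares with _post_process_completion_chunk accumulates deltas like this (standalone illustration; the delta strings are made up):

previous_texts = [""]
for delta_text in ["<tool_call>", '{"name": "f", ', '"arguments": {}}', "</tool_call>"]:
    current_text = previous_texts[-1] + delta_text if previous_texts else delta_text
    # the streaming tool parser sees (previous_texts, current_text, delta_text) here
    previous_texts[-1] = current_text
print(previous_texts[-1])
# <tool_call>{"name": "f", "arguments": {}}</tool_call>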
xinference/model/llm/vllm/core.py CHANGED
@@ -273,13 +273,19 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Deepseek-V3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+    VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.1.1"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
 
 
 class VLLMModel(LLM):
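These gates rely on packaging's version ordering, under which the four-component 0.10.1.1 sorts above 0.10.1 and 0.10.0:

from packaging import version

assert version.parse("0.10.1.1") > version.parse("0.10.1") > version.parse("0.10.0")
assert version.parse("0.9.2") < version.parse("0.10.0")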
@@ -387,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -767,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()
 
         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
         guided_json_object = None
         guided_json = None
 
@@ -778,8 +784,6 @@ class VLLMModel(LLM):
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"
 
         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -827,10 +831,6 @@ class VLLMModel(LLM):
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )
 
         return sanitized
 
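With the explicit guided_decoding_backend plumbing removed, the sanitizer only lifts the schema out of an OpenAI-style response_format and leaves backend selection to vLLM. A standalone sketch of the extraction kept by these hunks; the type check and the sample schema are illustrative, not copied from the source:

response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "weather",
        "json_schema": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}

guided_json = None
if response_format and response_format.get("type") == "json_schema":
    json_schema = response_format.get("json_schema")
    assert json_schema is not None
    guided_json = json_schema.get("json_schema")  # inner schema handed to guided decoding

print(guided_json)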
@@ -1285,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         return processed_messages
 
-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-
     @vllm_check
     async def async_chat(
         self,
@@ -1402,7 +1349,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._post_process_completion(
-                self.model_family, self.model_uid, c, self.reasoning_parser
+                self.model_family, self.model_uid, c
             )
         return self._to_chat_completion(c, self.reasoning_parser)
 
xinference/model/rerank/core.py CHANGED
@@ -97,6 +97,8 @@ class RerankModel:
         model_uid: str,
         model_path: str,
         model_family: RerankModelFamilyV2,
+        quantization: Optional[str],
+        *,
         device: Optional[str] = None,
         use_fp16: bool = False,
         **kwargs,
@@ -105,6 +107,7 @@
         self._model_spec = model_family.model_specs[0]
         self._model_uid = model_uid
         self._model_path = model_path
+        self._quantization = quantization
         self._device = device
         self._use_fp16 = use_fp16
         self._model = None
xinference/model/rerank/sentence_transformers/core.py CHANGED
@@ -72,7 +72,7 @@ class SentenceTransformerRerankModel(RerankModel):
         enable_flash_attn = self._kwargs.pop(
             "enable_flash_attn", is_flash_attn_available()
         )
-        if self._auto_detect_type(self._model_path) != "normal" and enable_flash_attn:
+        if enable_flash_attn:
             logger.warning(
                 "flash_attn can only support fp16 and bf16, will force set `use_fp16` to True"
             )
xinference/model/rerank/vllm/core.py CHANGED
@@ -3,6 +3,7 @@ import uuid
 from typing import List, Optional
 
 from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
+from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
 
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
@@ -22,9 +23,27 @@ class VLLMRerankModel(RerankModel):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            if "hf_overrides" not in self._kwargs:
+                self._kwargs["hf_overrides"] = {
+                    "architectures": ["Qwen3ForSequenceClassification"],
+                    "classifier_from_token": ["no", "yes"],
+                    "is_original_qwen3_reranker": True,
+                }
+            elif isinstance(self._kwargs["hf_overrides"], dict):
+                self._kwargs["hf_overrides"].update(
+                    architectures=["Qwen3ForSequenceClassification"],
+                    classifier_from_token=["no", "yes"],
+                    is_original_qwen3_reranker=True,
+                )
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
 
+    @cache_clean
     def rerank(
         self,
         documents: List[str],
@@ -51,14 +70,45 @@ class VLLMRerankModel(RerankModel):
         """
         if kwargs:
             raise RuntimeError("Unexpected keyword arguments: {}".format(kwargs))
+        assert self._model is not None
         documents_size = len(documents)
         query_list = [query] * documents_size
-        assert self._model is not None
-        outputs = self._model.score(
-            documents,
-            query_list,
-            use_tqdm=False,
-        )
+
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+            prefix = (
+                "<|im_start|>system\nJudge whether the Document meets the requirements based on"
+                " the Query and the Instruct provided. "
+                'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+            )
+            suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+            document_template = "<Document>: {doc}{suffix}"
+            processed_queries = [
+                query_template.format(
+                    prefix=prefix, instruction=instruction, query=query
+                )
+                for query in query_list
+            ]
+            processed_documents = [
+                document_template.format(doc=doc, suffix=suffix) for doc in documents
+            ]
+            outputs = self._model.score(
+                processed_documents,
+                processed_queries,
+                use_tqdm=False,
+            )
+
+        else:
+            outputs = self._model.score(
+                documents,
+                query_list,
+                use_tqdm=False,
+            )
         scores = map(lambda scoreoutput: scoreoutput.outputs.score, outputs)
         documents = list(map(lambda doc: Document(text=doc), documents))
         document_parts = list(zip(range(documents_size), scores, documents))
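To make the template concrete, the same formatting applied to a single query/document pair looks like this (the prefix, suffix, and templates are copied from the hunk above; the query and document values are illustrative):

instruction = "Given a web search query, retrieve relevant passages that answer the query"
prefix = (
    "<|im_start|>system\nJudge whether the Document meets the requirements based on"
    " the Query and the Instruct provided. "
    'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
)
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"

query = "what is xinference?"
doc = "Xorbits Inference serves LLM, embedding and rerank models."
print(query_template.format(prefix=prefix, instruction=instruction, query=query))
print(document_template.format(doc=doc, suffix=suffix))
# LLM.score() then receives the processed documents and queries in that order.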
xinference/model/utils.py CHANGED
@@ -619,8 +619,7 @@ def is_flash_attn_available() -> bool:
             f"GPU compute capability {compute_capability} < 8.0, "
             "flash_attn may not work optimally"
         )
-        # Note: Some older GPUs may also support flash_attn, so this is just a warning
-        # This threshold can be adjusted based on actual requirements
+        return False
 
     # Try to import flash_attn core module to verify correct installation
     try:
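is_flash_attn_available now returns False outright below compute capability 8.0 instead of only warning. The capability probe it relies on can be reproduced like this (a sketch only; the real helper additionally verifies that flash_attn imports cleanly):

import torch

def sm80_or_newer() -> bool:
    # Illustrative check: flash-attn generally targets SM 8.0+ (Ampere or newer).
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (8, 0)

print(sm80_or_newer())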
xinference/model/video/model_spec.json CHANGED
@@ -224,7 +224,7 @@
     },
     "virtualenv": {
       "packages": [
-        "git+https://github.com/huggingface/diffusers",
+        "diffusers==0.35.1",
         "ftfy",
         "imageio-ffmpeg",
         "imageio",
@@ -241,5 +241,99 @@
         "model_revision": "master"
       }
     }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-A14B",
+    "model_family": "Wan",
+    "model_ability": [
+      "text2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+        "model_revision": "5be7df9619b54f4e2667b2755bc6a756675b5cd7"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+        "model_revision": "master"
+      }
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-i2v-A14B",
+    "model_family": "Wan",
+    "model_ability": [
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+        "model_revision": "596658fd9ca6b7b71d5057529bbf319ecbc61d74"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+        "model_revision": "master"
+      }
+    }
+  },
+  {
+    "version": 2,
+    "model_name": "Wan2.2-ti2v-5B",
+    "model_family": "Wan",
+    "model_ability": [
+      "text2video",
+      "image2video"
+    ],
+    "default_model_config": {
+      "torch_dtype": "bfloat16"
+    },
+    "default_generate_config": {},
+    "virtualenv": {
+      "packages": [
+        "diffusers==0.35.1",
+        "ftfy",
+        "imageio-ffmpeg",
+        "imageio",
+        "#system_numpy#"
+      ]
+    },
+    "model_src": {
+      "huggingface": {
+        "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+        "model_revision": "b8fff7315c768468a5333511427288870b2e9635"
+      },
+      "modelscope": {
+        "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+        "model_revision": "master"
+      }
+    }
   }
 ]
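The new Wan2.2 entries register as built-in video models, so they should be launchable like the existing Wan entries. A hedged sketch with the Python client: the endpoint, chosen model, and prompt are illustrative, and the text_to_video call is assumed from the declared "text2video" ability rather than taken from this diff:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# model_name must match one of the spec entries above.
model_uid = client.launch_model(model_name="Wan2.2-ti2v-5B", model_type="video")
model = client.get_model(model_uid)
video = model.text_to_video(prompt="a red panda drinking tea, cinematic")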
xinference/thirdparty/cosyvoice/bin/export_jit.py CHANGED
@@ -61,8 +61,7 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-            # NOTE set use_flow_cache=True when export jit for cache inference
-            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+            model = CosyVoice2(args.model_dir)
         except Exception:
             raise TypeError('no valid model_type!')
 
@@ -93,9 +92,9 @@ def main():
     else:
         # 3. export flow encoder
         flow_encoder = model.model.flow.encoder
-        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script = get_optimized_script(flow_encoder)
         script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script = get_optimized_script(flow_encoder.half())
         script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
         logging.info('successfully export flow_encoder')