xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/model.py +3 -4
- xinference/core/supervisor.py +29 -1
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +64 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +19 -4
- xinference/model/embedding/vllm/core.py +40 -8
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +116 -9
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +42 -40
- xinference/model/llm/llm_family.json +435 -23
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +6 -12
- xinference/model/llm/utils.py +128 -46
- xinference/model/llm/vllm/core.py +8 -61
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/utils.py
CHANGED
@@ -51,6 +51,7 @@ from ...types import (
 )
 from .core import chat_context_var
 from .reasoning_parser import ReasoningParser
+from .tool_parsers.glm4_tool_parser import Glm4ToolParser
 
 logger = logging.getLogger(__name__)
 
@@ -82,7 +83,7 @@ LLAMA3_TOOL_CALL_FAMILY = [
     "HuatuoGPT-o1-LLaMA-3.1",
 ]
 
-DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"]
+DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528", "Deepseek-V3.1"]
 
 TOOL_CALL_FAMILY = (
     QWEN_TOOL_CALL_FAMILY
@@ -95,6 +96,13 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
+
+    def __init__(self):
+        self.model_family = None
+        self.model_uid = None
+        self.reasoning_parser = None
+        self.tool_parser = None
+
     @staticmethod
     @functools.lru_cache
     def _compile_jinja_template(chat_template):
@@ -590,16 +598,41 @@ class ChatModelMixin:
             pos2 = content.find(QWEN_TOOL_CALL_SYMBOLS[1])
             if pos2 != -1:
                 content = content[:pos2]
+
+            # Skip empty content after extraction
+            if not content.strip():
+                continue
+
             try:
                 res = json.loads(content, strict=False)
-
-
+                if isinstance(res, dict):
+                    # Check if required fields exist
+                    if "name" in res and "arguments" in res:
+                        results.append((None, res["name"], res["arguments"]))
+                    else:
+                        logger.warning(
+                            "Missing required fields in qwen tool call: %s", content
+                        )
+                        results.append((content, None, None))
+                else:
+                    logger.warning(
+                        "Qwen tool call result is not a dict: %s", content
+                    )
+                    results.append((content, None, None))
+            except json.JSONDecodeError as e:
                 logger.error(
                     "Can't parse single qwen tool call output: %s. Error: %s",
                     content,
                     e,
                 )
                 results.append((content, None, None))
+            except Exception as e:
+                logger.error(
+                    "Unexpected error parsing qwen tool call: %s. Error: %s",
+                    content,
+                    e,
+                )
+                results.append((content, None, None))
         return results
 
     @classmethod
@@ -757,47 +790,60 @@ class ChatModelMixin:
         logger.debug(f"Tool call content: {result}")
         return result
 
-    @classmethod
     def _post_process_completion_chunk(
-
+        self,
         model_family,
         model_uid,
         c,
         chunk_id=None,
-
-        tool_call_text: Optional[str] = None,
+        previous_texts: List[str] = [""],
     ):
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
-
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                [],
+                c,
+                c,
+            )
+        else:
+            finish_reason = c["choices"][0]["finish_reason"]
+            delta_text = c["choices"][0]["delta"]["content"]
+            current_text = (
+                previous_texts[-1] + delta_text if previous_texts else delta_text
+            )
+            tool_result = self.tool_parser.extract_tool_calls_streaming(
+                previous_texts,
+                current_text,
+                delta_text,
+            )
+            previous_texts[-1] = current_text
+        if tool_result is None and not finish_reason:
+            return None
         tool_calls = []
         failed_contents = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        finish_reason = "tool_calls" if tool_calls else "stop"
+        content, func, args = tool_result if tool_result else ("", None, None)
+        if func:
+            tool_calls.append(
+                {
+                    "index": 0,
+                    "id": f"call_{_id}",
+                    "type": "function",
+                    "function": {
+                        "name": func,
+                        "arguments": json.dumps(args, ensure_ascii=False),
+                    },
+                }
+            )
+        else:
+            failed_contents.append(content)
 
-
+        finish_reason = "tool_calls" if tool_calls else finish_reason
 
-
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
+        content = "".join(failed_contents) if failed_contents else None
 
         d = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
 
@@ -826,29 +872,32 @@ class ChatModelMixin:
             "usage": usage,
         }
 
-    @classmethod
     def _post_process_completion(
-
+        self,
         model_family,
         model_uid,
         c,
-        reasoning_parser: Optional[ReasoningParser] = None,
     ):
-        if
-
+        if not self.tool_parser:
+            return self._get_final_chat_completion_chunk(c)
+        if self.reasoning_parser:
+            c = self.reasoning_parser.prepare_reasoning_content(c)
         _id = str(uuid.uuid4())
         reasoning_content = None
-        if reasoning_parser and reasoning_parser.check_content_parser():
+        if self.reasoning_parser and self.reasoning_parser.check_content_parser():
             text = c["choices"][0]["text"]
-            reasoning_content, content =
-                text
+            reasoning_content, content = (
+                self.reasoning_parser.extract_reasoning_content(text)
             )
             c["choices"][0]["text"] = content
 
-        tool_result = cls._eval_tool_arguments(model_family, c)
-
         tool_calls = []
         failed_contents = []
+        if isinstance(self.tool_parser, Glm4ToolParser):
+            tool_result = self.tool_parser.extract_tool_calls(c)
+        else:
+            text = c["choices"][0]["text"]
+            tool_result = self.tool_parser.extract_tool_calls(text)
         for content, func, args in tool_result:
             if func:
                 tool_calls.append(
@@ -868,14 +917,9 @@ class ChatModelMixin:
 
         content = "".join(failed_contents) if failed_contents else None
 
-        # fix: qwen tool_call content field return null
-        family = model_family.model_family or model_family.model_name
-        if tool_calls and family in QWEN_TOOL_CALL_FAMILY and content is None:
-            content = ""
-
         m = {
             "role": "assistant",
-            "content": content,
+            "content": content if content else "",
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
@@ -943,6 +987,44 @@ class ChatModelMixin:
 
         return transformed_messages
 
+    async def _async_to_tool_completion_chunks(
+        self,
+        chunks: AsyncGenerator[CompletionChunk, None],
+        ctx: Optional[Dict[str, Any]] = None,
+    ) -> AsyncGenerator[ChatCompletionChunk, None]:
+        def set_context():
+            if ctx:
+                chat_context_var.set(ctx)
+
+        i = 0
+        previous_texts = [""]
+        previous_tools_texts = [""]
+        full_text = ""
+        if self.reasoning_parser:
+            set_context()
+            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
+        async for completion_chunk in chunks:
+            set_context()
+            chat_chunk = self._to_chat_completion_chunk(
+                completion_chunk, self.reasoning_parser, previous_texts
+            )
+            if (
+                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
+            ):
+                yield chat_chunk
+                continue
+            processed_chunk = self._post_process_completion_chunk(
+                self.model_family,
+                self.model_uid,
+                chat_chunk,
+                previous_texts=previous_tools_texts,
+            )
+            if processed_chunk:
+                yield processed_chunk
+            i += 1
+        logger.debug("Chat finished, output: %s", full_text)
+
 
 def get_model_version(
     model_name: str,
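Note: the reworked qwen tool-call handling above only accepts a JSON object that carries both "name" and "arguments"; anything else is logged and passed through as plain content. A minimal standalone sketch of that validation (the get_weather payload below is invented for illustration and is not part of the package):

    import json
    import logging

    logger = logging.getLogger(__name__)

    def parse_single_tool_call(content: str):
        """Mirror of the new checks: only a dict with 'name' and 'arguments' counts."""
        if not content.strip():
            return None  # empty payloads are now skipped entirely
        try:
            res = json.loads(content, strict=False)
        except json.JSONDecodeError as e:
            logger.error("Can't parse tool call output: %s. Error: %s", content, e)
            return (content, None, None)
        if isinstance(res, dict) and "name" in res and "arguments" in res:
            return (None, res["name"], res["arguments"])
        logger.warning("Malformed tool call payload: %s", content)
        return (content, None, None)

    # Hypothetical payload, for illustration only.
    print(parse_single_tool_call('{"name": "get_weather", "arguments": {"city": "Paris"}}'))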
xinference/model/llm/vllm/core.py
CHANGED
@@ -273,13 +273,19 @@ if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.9.2"):
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking")
     VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder")
+    VLLM_SUPPORTED_CHAT_MODELS.append("Deepseek-V3.1")
 
 if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")
     VLLM_SUPPORTED_VISION_MODEL_LIST.append("glm-4.5v")
+    VLLM_SUPPORTED_CHAT_MODELS.append("KAT-V1")
 
 if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.0"):
     VLLM_SUPPORTED_CHAT_MODELS.append("gpt-oss")
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
+
+if VLLM_INSTALLED and VLLM_VERSION > version.parse("0.10.1.1"):
+    VLLM_SUPPORTED_CHAT_MODELS.append("seed-oss")
 
 
 class VLLMModel(LLM):
@@ -387,6 +393,7 @@ class VLLMModel(LLM):
         self.prepare_parse_reasoning_content(
             reasoning_content, enable_thinking=enable_thinking
         )
+        self.prepare_parse_tool_calls()
 
         if (
             isinstance(self.model_spec, LlamaCppLLMSpecV2)
@@ -767,7 +774,6 @@ class VLLMModel(LLM):
         sanitized = VLLMGenerateConfig()
 
         response_format = generate_config.pop("response_format", None)
-        guided_decoding_backend = generate_config.get("guided_decoding_backend", None)
        guided_json_object = None
         guided_json = None
 
@@ -778,8 +784,6 @@
             json_schema = response_format.get("json_schema")
             assert json_schema is not None
             guided_json = json_schema.get("json_schema")
-            if guided_decoding_backend is None:
-                guided_decoding_backend = "outlines"
 
         sanitized.setdefault("lora_name", generate_config.get("lora_name", None))
         sanitized.setdefault("n", generate_config.get("n", 1))
@@ -827,10 +831,6 @@
             "guided_json_object",
             generate_config.get("guided_json_object", guided_json_object),
         )
-        sanitized.setdefault(
-            "guided_decoding_backend",
-            generate_config.get("guided_decoding_backend", guided_decoding_backend),
-        )
 
         return sanitized
 
@@ -1285,59 +1285,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         return processed_messages
 
-    async def _async_to_tool_completion_chunks(
-        self,
-        chunks: AsyncGenerator[CompletionChunk, None],
-        ctx: Optional[Dict[str, Any]] = {},
-    ) -> AsyncGenerator[ChatCompletionChunk, None]:
-        def set_context():
-            if ctx:
-                chat_context_var.set(ctx)
-
-        i = 0
-        previous_texts = [""]
-        tool_call = False
-        tool_call_texts = [""]
-        full_text = ""
-        if self.reasoning_parser:
-            set_context()
-            chunks = self.reasoning_parser.prepare_reasoning_content_streaming(chunks)
-        async for chunk in chunks:
-            set_context()
-            if i == 0:
-                for first_chunk in self._get_first_chat_completion_chunk(
-                    chunk, self.reasoning_parser
-                ):
-                    yield first_chunk
-            # usage
-            choices = chunk.get("choices")
-            if not choices:
-                yield self._get_final_chat_completion_chunk(chunk)
-            else:
-                full_text += chunk["choices"][0]["text"]
-                if self.is_tool_call_chunk_start(chunk):
-                    tool_call = True
-                if tool_call:
-                    tool_call_text = tool_call_texts[-1]
-                    tool_call_text += chunk["choices"][0]["text"]
-                    tool_call_texts.append(tool_call_text)
-                    if self.is_tool_call_chunk_end(chunk):
-                        yield self._post_process_completion_chunk(
-                            self.model_family,
-                            self.model_uid,
-                            chunk,
-                            reasoning_parser=self.reasoning_parser,
-                            tool_call_text=tool_call_text,
-                        )
-                        tool_call = False
-                        tool_call_texts = [""]
-                else:
-                    yield self._to_chat_completion_chunk(
-                        chunk, self.reasoning_parser, previous_texts
-                    )
-            i += 1
-        logger.debug("Chat finished, output: %s", full_text)
-
     @vllm_check
     async def async_chat(
         self,
@@ -1402,7 +1349,7 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
             assert not isinstance(c, AsyncGenerator)
             if tools:
                 return self._post_process_completion(
-                    self.model_family, self.model_uid, c
+                    self.model_family, self.model_uid, c
                 )
             return self._to_chat_completion(c, self.reasoning_parser)
 
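With Deepseek-V3.1 registered both in VLLM_SUPPORTED_CHAT_MODELS and in DEEPSEEK_TOOL_CALL_FAMILY, tool calls can be exercised through xinference's OpenAI-compatible endpoint. A hedged usage sketch, assuming a local server on the default port 9997, a model launched under the uid "Deepseek-V3.1", and an invented get_weather tool:

    from openai import OpenAI

    # The base_url, model uid, and tool definition are assumptions for illustration.
    client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    resp = client.chat.completions.create(
        model="Deepseek-V3.1",
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=tools,
    )
    # Tool calls, if any, arrive in the standard OpenAI shape.
    print(resp.choices[0].message.tool_calls)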
xinference/model/rerank/core.py
CHANGED
@@ -97,6 +97,8 @@ class RerankModel:
         model_uid: str,
         model_path: str,
         model_family: RerankModelFamilyV2,
+        quantization: Optional[str],
+        *,
         device: Optional[str] = None,
         use_fp16: bool = False,
         **kwargs,
@@ -105,6 +107,7 @@
         self._model_spec = model_family.model_specs[0]
         self._model_uid = model_uid
         self._model_path = model_path
+        self._quantization = quantization
         self._device = device
         self._use_fp16 = use_fp16
         self._model = None
xinference/model/rerank/sentence_transformers/core.py
CHANGED
@@ -72,7 +72,7 @@ class SentenceTransformerRerankModel(RerankModel):
         enable_flash_attn = self._kwargs.pop(
             "enable_flash_attn", is_flash_attn_available()
         )
-        if
+        if enable_flash_attn:
             logger.warning(
                 "flash_attn can only support fp16 and bf16, will force set `use_fp16` to True"
             )
xinference/model/rerank/vllm/core.py
CHANGED
@@ -3,6 +3,7 @@ import uuid
 from typing import List, Optional
 
 from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
+from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
 
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "Qwen3"]
@@ -22,9 +23,27 @@ class VLLMRerankModel(RerankModel):
 
             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            if "hf_overrides" not in self._kwargs:
+                self._kwargs["hf_overrides"] = {
+                    "architectures": ["Qwen3ForSequenceClassification"],
+                    "classifier_from_token": ["no", "yes"],
+                    "is_original_qwen3_reranker": True,
+                }
+            elif isinstance(self._kwargs["hf_overrides"], dict):
+                self._kwargs["hf_overrides"].update(
+                    architectures=["Qwen3ForSequenceClassification"],
+                    classifier_from_token=["no", "yes"],
+                    is_original_qwen3_reranker=True,
+                )
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
 
+    @cache_clean
     def rerank(
         self,
         documents: List[str],
@@ -51,14 +70,45 @@
         """
         if kwargs:
             raise RuntimeError("Unexpected keyword arguments: {}".format(kwargs))
+        assert self._model is not None
         documents_size = len(documents)
         query_list = [query] * documents_size
-
-
-
-
-
-
+
+        if self.model_family.model_name in {
+            "Qwen3-Reranker-0.6B",
+            "Qwen3-Reranker-4B",
+            "Qwen3-Reranker-8B",
+        }:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+            prefix = (
+                "<|im_start|>system\nJudge whether the Document meets the requirements based on"
+                " the Query and the Instruct provided. "
+                'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+            )
+            suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+            document_template = "<Document>: {doc}{suffix}"
+            processed_queries = [
+                query_template.format(
+                    prefix=prefix, instruction=instruction, query=query
+                )
+                for query in query_list
+            ]
+            processed_documents = [
+                document_template.format(doc=doc, suffix=suffix) for doc in documents
+            ]
+            outputs = self._model.score(
+                processed_documents,
+                processed_queries,
+                use_tqdm=False,
+            )
+
+        else:
+            outputs = self._model.score(
+                documents,
+                query_list,
+                use_tqdm=False,
+            )
         scores = map(lambda scoreoutput: scoreoutput.outputs.score, outputs)
         documents = list(map(lambda doc: Document(text=doc), documents))
         document_parts = list(zip(range(documents_size), scores, documents))
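For reference, the prompt framing that the new Qwen3-Reranker branch builds before calling LLM.score() can be reproduced standalone; the sample query and documents below are invented for illustration:

    # Standalone sketch of the Qwen3-Reranker prompt framing added above.
    instruction = "Given a web search query, retrieve relevant passages that answer the query"
    prefix = (
        "<|im_start|>system\nJudge whether the Document meets the requirements based on"
        " the Query and the Instruct provided. "
        'Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
    )
    suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
    query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
    document_template = "<Document>: {doc}{suffix}"

    # Hypothetical inputs, for illustration only.
    query = "how do rerankers score passages?"
    documents = ["Cross-encoders score query-document pairs.", "Bubble sort is O(n^2)."]
    processed_queries = [
        query_template.format(prefix=prefix, instruction=instruction, query=query)
        for _ in documents
    ]
    processed_documents = [document_template.format(doc=d, suffix=suffix) for d in documents]
    print(processed_queries[0] + processed_documents[0])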
xinference/model/utils.py
CHANGED
@@ -619,8 +619,7 @@ def is_flash_attn_available() -> bool:
                 f"GPU compute capability {compute_capability} < 8.0, "
                 "flash_attn may not work optimally"
             )
-
-            # This threshold can be adjusted based on actual requirements
+            return False
 
     # Try to import flash_attn core module to verify correct installation
     try:
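The change above makes is_flash_attn_available() return False on GPUs below compute capability 8.0 instead of only warning about them. A rough standalone sketch of that gate (not the package's exact implementation):

    import torch

    def flash_attn_usable() -> bool:
        # No CUDA device: flash_attn cannot be used.
        if not torch.cuda.is_available():
            return False
        major, _minor = torch.cuda.get_device_capability()
        # Pre-Ampere GPUs (compute capability < 8.0) are now rejected outright.
        if major < 8:
            return False
        # Import check verifies the package is actually installed.
        try:
            import flash_attn  # noqa: F401
        except ImportError:
            return False
        return True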
xinference/model/video/model_spec.json
CHANGED
@@ -224,7 +224,7 @@
         },
         "virtualenv": {
             "packages": [
-                "
+                "diffusers==0.35.1",
                 "ftfy",
                 "imageio-ffmpeg",
                 "imageio",
@@ -241,5 +241,99 @@
                 "model_revision": "master"
             }
         }
+    },
+    {
+        "version": 2,
+        "model_name": "Wan2.2-A14B",
+        "model_family": "Wan",
+        "model_ability": [
+            "text2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {},
+        "virtualenv": {
+            "packages": [
+                "diffusers==0.35.1",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "#system_numpy#"
+            ]
+        },
+        "model_src": {
+            "huggingface": {
+                "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+                "model_revision": "5be7df9619b54f4e2667b2755bc6a756675b5cd7"
+            },
+            "modelscope": {
+                "model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+                "model_revision": "master"
+            }
+        }
+    },
+    {
+        "version": 2,
+        "model_name": "Wan2.2-i2v-A14B",
+        "model_family": "Wan",
+        "model_ability": [
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {},
+        "virtualenv": {
+            "packages": [
+                "diffusers==0.35.1",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "#system_numpy#"
+            ]
+        },
+        "model_src": {
+            "huggingface": {
+                "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+                "model_revision": "596658fd9ca6b7b71d5057529bbf319ecbc61d74"
+            },
+            "modelscope": {
+                "model_id": "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+                "model_revision": "master"
+            }
+        }
+    },
+    {
+        "version": 2,
+        "model_name": "Wan2.2-ti2v-5B",
+        "model_family": "Wan",
+        "model_ability": [
+            "text2video",
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {},
+        "virtualenv": {
+            "packages": [
+                "diffusers==0.35.1",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "#system_numpy#"
+            ]
+        },
+        "model_src": {
+            "huggingface": {
+                "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+                "model_revision": "b8fff7315c768468a5333511427288870b2e9635"
+            },
+            "modelscope": {
+                "model_id": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+                "model_revision": "master"
+            }
+        }
     }
 ]
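The three Wan2.2 entries above can then be served like any other video model. A hedged usage sketch, assuming a running xinference endpoint at 127.0.0.1:9997 and the client's launch_model / text_to_video conventions; the prompt is invented for illustration:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")
    # Launch one of the newly added Wan2.2 video specs.
    model_uid = client.launch_model(model_name="Wan2.2-ti2v-5B", model_type="video")
    model = client.get_model(model_uid)
    # Request a clip from a text prompt (assumed handle method for video models).
    video = model.text_to_video(prompt="a red fox running through snow")
    print(video)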
xinference/thirdparty/cosyvoice/bin/export_jit.py
CHANGED
@@ -61,8 +61,7 @@ def main():
         model = CosyVoice(args.model_dir)
     except Exception:
         try:
-
-            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+            model = CosyVoice2(args.model_dir)
         except Exception:
             raise TypeError('no valid model_type!')
 
@@ -93,9 +92,9 @@ def main():
     else:
         # 3. export flow encoder
        flow_encoder = model.model.flow.encoder
-        script = get_optimized_script(flow_encoder
+        script = get_optimized_script(flow_encoder)
        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
-        script = get_optimized_script(flow_encoder.half()
+        script = get_optimized_script(flow_encoder.half())
        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
        logging.info('successfully export flow_encoder')
 
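The export_jit fixes above simply close the get_optimized_script(...) calls; the helper itself is not shown in this diff. A hedged sketch of what such a TorchScript export helper typically does (scripting, freezing, then optimize_for_inference), under the assumption that the vendored helper follows the usual recipe:

    import torch

    def jit_export_sketch(module: torch.nn.Module) -> torch.jit.ScriptModule:
        # Script the module in eval mode, freeze it, and apply inference optimizations.
        # The result can then be persisted with script.save("flow.encoder.fp32.zip").
        scripted = torch.jit.script(module.eval())
        return torch.jit.optimize_for_inference(torch.jit.freeze(scripted))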