xinference 1.7.1.post1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/client/restful/async_restful_client.py +8 -13
- xinference/client/restful/restful_client.py +6 -2
- xinference/core/chat_interface.py +6 -4
- xinference/core/media_interface.py +5 -0
- xinference/core/model.py +1 -5
- xinference/core/supervisor.py +117 -68
- xinference/core/worker.py +49 -37
- xinference/deploy/test/test_cmdline.py +2 -6
- xinference/model/audio/__init__.py +26 -23
- xinference/model/audio/chattts.py +3 -2
- xinference/model/audio/core.py +49 -98
- xinference/model/audio/cosyvoice.py +3 -2
- xinference/model/audio/custom.py +28 -73
- xinference/model/audio/f5tts.py +3 -2
- xinference/model/audio/f5tts_mlx.py +3 -2
- xinference/model/audio/fish_speech.py +3 -2
- xinference/model/audio/funasr.py +17 -4
- xinference/model/audio/kokoro.py +3 -2
- xinference/model/audio/megatts.py +3 -2
- xinference/model/audio/melotts.py +3 -2
- xinference/model/audio/model_spec.json +572 -171
- xinference/model/audio/utils.py +0 -6
- xinference/model/audio/whisper.py +3 -2
- xinference/model/audio/whisper_mlx.py +3 -2
- xinference/model/cache_manager.py +141 -0
- xinference/model/core.py +6 -49
- xinference/model/custom.py +174 -0
- xinference/model/embedding/__init__.py +67 -56
- xinference/model/embedding/cache_manager.py +35 -0
- xinference/model/embedding/core.py +104 -84
- xinference/model/embedding/custom.py +55 -78
- xinference/model/embedding/embed_family.py +80 -31
- xinference/model/embedding/flag/core.py +21 -5
- xinference/model/embedding/llama_cpp/__init__.py +0 -0
- xinference/model/embedding/llama_cpp/core.py +234 -0
- xinference/model/embedding/model_spec.json +968 -103
- xinference/model/embedding/sentence_transformers/core.py +30 -20
- xinference/model/embedding/vllm/core.py +11 -5
- xinference/model/flexible/__init__.py +8 -2
- xinference/model/flexible/core.py +26 -119
- xinference/model/flexible/custom.py +69 -0
- xinference/model/flexible/launchers/image_process_launcher.py +1 -0
- xinference/model/flexible/launchers/modelscope_launcher.py +5 -1
- xinference/model/flexible/launchers/transformers_launcher.py +15 -3
- xinference/model/flexible/launchers/yolo_launcher.py +5 -1
- xinference/model/image/__init__.py +20 -20
- xinference/model/image/cache_manager.py +62 -0
- xinference/model/image/core.py +70 -182
- xinference/model/image/custom.py +28 -72
- xinference/model/image/model_spec.json +402 -119
- xinference/model/image/ocr/got_ocr2.py +3 -2
- xinference/model/image/stable_diffusion/core.py +22 -7
- xinference/model/image/stable_diffusion/mlx.py +6 -6
- xinference/model/image/utils.py +2 -2
- xinference/model/llm/__init__.py +71 -94
- xinference/model/llm/cache_manager.py +292 -0
- xinference/model/llm/core.py +37 -111
- xinference/model/llm/custom.py +88 -0
- xinference/model/llm/llama_cpp/core.py +5 -7
- xinference/model/llm/llm_family.json +16260 -8151
- xinference/model/llm/llm_family.py +138 -839
- xinference/model/llm/lmdeploy/core.py +5 -7
- xinference/model/llm/memory.py +3 -4
- xinference/model/llm/mlx/core.py +6 -8
- xinference/model/llm/reasoning_parser.py +3 -1
- xinference/model/llm/sglang/core.py +32 -14
- xinference/model/llm/transformers/chatglm.py +3 -7
- xinference/model/llm/transformers/core.py +49 -27
- xinference/model/llm/transformers/deepseek_v2.py +2 -2
- xinference/model/llm/transformers/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/cogagent.py +2 -2
- xinference/model/llm/transformers/multimodal/deepseek_vl2.py +2 -2
- xinference/model/llm/transformers/multimodal/gemma3.py +2 -2
- xinference/model/llm/transformers/multimodal/glm4_1v.py +167 -0
- xinference/model/llm/transformers/multimodal/glm4v.py +2 -2
- xinference/model/llm/transformers/multimodal/intern_vl.py +2 -2
- xinference/model/llm/transformers/multimodal/minicpmv26.py +3 -3
- xinference/model/llm/transformers/multimodal/ovis2.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen-omni.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +2 -2
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +2 -2
- xinference/model/llm/transformers/opt.py +3 -7
- xinference/model/llm/utils.py +34 -49
- xinference/model/llm/vllm/core.py +77 -27
- xinference/model/llm/vllm/xavier/engine.py +5 -3
- xinference/model/llm/vllm/xavier/scheduler.py +10 -6
- xinference/model/llm/vllm/xavier/transfer.py +1 -1
- xinference/model/rerank/__init__.py +26 -25
- xinference/model/rerank/core.py +47 -87
- xinference/model/rerank/custom.py +25 -71
- xinference/model/rerank/model_spec.json +158 -33
- xinference/model/rerank/utils.py +2 -2
- xinference/model/utils.py +115 -54
- xinference/model/video/__init__.py +13 -17
- xinference/model/video/core.py +44 -102
- xinference/model/video/diffusers.py +4 -3
- xinference/model/video/model_spec.json +90 -21
- xinference/types.py +5 -3
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.7d24df53.js +3 -0
- xinference/web/ui/build/static/js/main.7d24df53.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/2704ff66a5f73ca78b341eb3edec60154369df9d87fbc8c6dd60121abc5e1b0a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/607dfef23d33e6b594518c0c6434567639f24f356b877c80c60575184ec50ed0.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9be3d56173aacc3efd0b497bcb13c4f6365de30069176ee9403b40e717542326.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/9f9dd6c32c78a222d07da5987ae902effe16bcf20aac00774acdccc4de3c9ff2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b2ab5ee972c60d15eb9abf5845705f8ab7e1d125d324d9a9b1bcae5d6fd7ffb2.json +1 -0
- xinference/web/ui/src/locales/en.json +0 -1
- xinference/web/ui/src/locales/ja.json +0 -1
- xinference/web/ui/src/locales/ko.json +0 -1
- xinference/web/ui/src/locales/zh.json +0 -1
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/METADATA +9 -11
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/RECORD +119 -119
- xinference/model/audio/model_spec_modelscope.json +0 -231
- xinference/model/embedding/model_spec_modelscope.json +0 -293
- xinference/model/embedding/utils.py +0 -18
- xinference/model/image/model_spec_modelscope.json +0 -375
- xinference/model/llm/llama_cpp/memory.py +0 -457
- xinference/model/llm/llm_family_csghub.json +0 -56
- xinference/model/llm/llm_family_modelscope.json +0 -8700
- xinference/model/llm/llm_family_openmind_hub.json +0 -1019
- xinference/model/rerank/model_spec_modelscope.json +0 -85
- xinference/model/video/model_spec_modelscope.json +0 -184
- xinference/web/ui/build/static/js/main.9b12b7f9.js +0 -3
- xinference/web/ui/build/static/js/main.9b12b7f9.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1460361af6975e63576708039f1cb732faf9c672d97c494d4055fc6331460be0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4efd8dda58fda83ed9546bf2f587df67f8d98e639117bee2d9326a9a1d9bebb2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5b2dafe5aa9e1105e0244a2b6751807342fa86aa0144b4e84d947a1686102715.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/611fa2c6c53b66039991d06dfb0473b5ab37fc63b4564e0f6e1718523768a045.json +0 -1
- /xinference/web/ui/build/static/js/{main.9b12b7f9.js.LICENSE.txt → main.7d24df53.js.LICENSE.txt} +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/WHEEL +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.7.1.post1.dist-info → xinference-1.8.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/lmdeploy/core.py CHANGED

@@ -20,7 +20,7 @@ import torch

 from ....types import ChatCompletion, ChatCompletionChunk, Completion, LoRA
 from ..core import LLM
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin, generate_chat_completion, generate_completion_chunk

 logger = logging.getLogger(__name__)

@@ -76,14 +76,12 @@ class LMDeployModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[LMDeployModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._model_config: LMDeployModelConfig = self._sanitize_model_config(
             model_config
         )

@@ -119,7 +117,7 @@ class LMDeployModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         return False

@@ -172,7 +170,7 @@ class LMDeployChatModel(LMDeployModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format == "awq":
             # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
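Note: the same constructor rewrite repeats across every backend in this diff (LMDeploy above, then MLX, SGLang and the Transformers family below): model_spec and quantization leave the parameter list, and the LLMFamilyV2 object becomes the one constructor argument carrying family metadata. A minimal sketch of the new call shape, using hypothetical stand-in classes rather than the real xinference types:

from dataclasses import dataclass, field
from typing import List

@dataclass
class SpecV2:
    # hypothetical stand-in for a model spec entry
    model_format: str
    quantization: str = "none"

@dataclass
class FamilyV2:
    # hypothetical stand-in for LLMFamilyV2; the real class carries much more
    model_name: str
    model_specs: List[SpecV2] = field(default_factory=list)

class BackendModel:
    # 1.8.0-style constructor: spec/quantization now ride inside the family object
    def __init__(self, model_uid: str, model_family: FamilyV2, model_path: str):
        self.model_uid = model_uid
        self.model_family = model_family
        self.model_path = model_path

# 1.7.x call shape, for contrast: BackendModel(uid, family, spec, quantization, path)
model = BackendModel(
    "qwen-0",
    FamilyV2("qwen2.5", [SpecV2("pytorch")]),
    "/models/qwen2.5",
)
print(model.model_family.model_specs[0].model_format)  # pytorch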
xinference/model/llm/memory.py CHANGED

@@ -214,16 +214,15 @@ def get_model_layers_info(
         logger.debug("get_model_layers_info by default size=%s", model_size_in_billions)
         size_in_billions = convert_model_size_to_float(model_size_in_billions)
         return _get_default_layers_from_size(size_in_billions)
-
+    llm_family = match_llm(
         model_name=model_name,
         model_format=model_format,
         model_size_in_billions=model_size_in_billions,
         quantization=quantization,
     )
-    if not
+    if not llm_family:
         return None
-
-    config_path = cache_model_config(llm_family, llm_spec)
+    config_path = cache_model_config(llm_family)
     return load_model_config_json(config_path)


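Note: the removed lines are truncated in this rendering, but the surviving fragments (if not …, cache_model_config(llm_family, llm_spec)) suggest the 1.7.x flow unpacked a match result into family plus spec before caching, while 1.8.0 passes only the matched family. A hedged sketch of the new control flow with stub helpers (the real match_llm and cache_model_config take more parameters):

from typing import Optional

def match_llm_stub(model_name: str) -> Optional[dict]:
    # stand-in: the real match_llm also filters by format, size and quantization
    known = {"qwen2.5": {"config": {"num_hidden_layers": 28}}}
    return known.get(model_name)

def get_model_config(model_name: str) -> Optional[dict]:
    llm_family = match_llm_stub(model_name)
    if not llm_family:
        return None
    # real code: config_path = cache_model_config(llm_family)
    #            return load_model_config_json(config_path)
    return llm_family["config"]

print(get_model_config("qwen2.5"))  # {'num_hidden_layers': 28}
print(get_model_config("missing"))  # None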
xinference/model/llm/mlx/core.py CHANGED

@@ -48,7 +48,7 @@ from ....types import (
     LoRA,
 )
 from ..core import LLM, chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     QWEN_TOOL_CALL_FAMILY,

@@ -98,14 +98,12 @@ class MLXModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[MLXModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._model_config: MLXModelConfig = self._sanitize_model_config(model_config)
         # for distributed

@@ -370,7 +368,7 @@ class MLXModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False

@@ -670,7 +668,7 @@ class MLXChatModel(MLXModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False

@@ -734,7 +732,7 @@ class MLXVisionModel(MLXModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["mlx"]:
             return False
xinference/model/llm/reasoning_parser.py CHANGED

@@ -165,7 +165,9 @@ class ReasoningParser:
         Returns:
             bool: True if reasoning content should be extracted, False otherwise
         """
-
+        if self.is_enable_thinking():
+            return self.reasoning_content
+        return False

     def _create_chat_completion_chunk(
         self, chunk: Union[Dict[str, Any], CompletionChunk], content: str
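Note: the three added lines gate reasoning extraction on two flags at once. A standalone sketch of that behavior with a stub class (field names mirror the diff; this is not the real ReasoningParser):

class ParserStub:
    def __init__(self, enable_thinking: bool, reasoning_content: bool):
        self._enable_thinking = enable_thinking
        self.reasoning_content = reasoning_content

    def is_enable_thinking(self) -> bool:
        return self._enable_thinking

    def should_extract(self) -> bool:
        # mirrors the added lines: reasoning_content only matters when thinking is on
        if self.is_enable_thinking():
            return self.reasoning_content
        return False

for thinking in (True, False):
    for content in (True, False):
        print(thinking, content, "->", ParserStub(thinking, content).should_extract())
# only (True, True) prints True: extraction requires both flags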
xinference/model/llm/sglang/core.py CHANGED

@@ -32,10 +32,15 @@ from ....types import (
     CompletionChunk,
     CompletionUsage,
 )
-from .. import LLM,
+from .. import LLM, LLMFamilyV2, LLMSpecV1
 from ..core import chat_context_var
-from ..llm_family import
-from ..utils import
+from ..llm_family import CustomLLMFamilyV2
+from ..utils import (
+    DEEPSEEK_TOOL_CALL_FAMILY,
+    QWEN_TOOL_CALL_FAMILY,
+    ChatModelMixin,
+    generate_completion_chunk,
+)

 logger = logging.getLogger(__name__)

@@ -131,13 +136,11 @@ class SGLANGModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         model_config: Optional[SGLANGModelConfig],
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._model_config = model_config
         self._engine = None
         self._address = model_config.pop("address", None)  # type: ignore

@@ -319,7 +322,7 @@ class SGLANGModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -330,7 +333,7 @@ class SGLANGModel(LLM):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_MODELS:
                 return False
         else:

@@ -547,14 +550,14 @@ class SGLANGModel(LLM):
 class SGLANGChatModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq", "fp8"]:
             return False
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_CHAT_MODELS:
                 return False
         else:

@@ -583,6 +586,9 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         request_id: Optional[str] = None,
     ) -> Union[ChatCompletion, AsyncGenerator[ChatCompletionChunk, None]]:
         assert self.model_family.chat_template is not None
+        # fix: Object of type list_iterator is not JSON serializable
+        tools = list(generate_config.pop("tools", [])) if generate_config else None
+        model_family = self.model_family.model_family or self.model_family.model_name
         chat_template_kwargs = (
             self._get_chat_template_kwargs_from_generate_config(
                 generate_config, self.reasoning_parser

@@ -591,6 +597,12 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         )
         chat_context_var.set(chat_template_kwargs)
         full_context_kwargs = chat_template_kwargs.copy()
+        if tools:
+            if (
+                model_family in QWEN_TOOL_CALL_FAMILY
+                or model_family in DEEPSEEK_TOOL_CALL_FAMILY
+            ):
+                full_context_kwargs["tools"] = tools
         full_prompt = self.get_full_context(
             messages, self.model_family.chat_template, **full_context_kwargs
         )

@@ -599,17 +611,23 @@ class SGLANGChatModel(SGLANGModel, ChatModelMixin):
         if stream:
             agen = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert isinstance(agen, AsyncGenerator)
-            return self._async_to_chat_completion_chunks(
+            return self._async_to_chat_completion_chunks(
+                agen, self.reasoning_parser, chat_template_kwargs
+            )
         else:
             c = await self.async_generate(full_prompt, generate_config=generate_config)  # type: ignore
             assert not isinstance(c, AsyncGenerator)
+            if tools:
+                return self._post_process_completion(
+                    self.model_family, self.model_uid, c, self.reasoning_parser
+                )
             return self._to_chat_completion(c, self.reasoning_parser)


 class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if not cls._has_cuda_device():
             return False

@@ -620,7 +638,7 @@ class SGLANGVisionModel(SGLANGModel, ChatModelMixin):
         if llm_spec.model_format == "pytorch":
             if quantization != "none" and not (quantization is None):
                 return False
-        if isinstance(llm_family,
+        if isinstance(llm_family, CustomLLMFamilyV2):
             if llm_family.model_family not in SGLANG_SUPPORTED_VISION_MODEL_LIST:
                 return False
         else:
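Note: the tools = list(generate_config.pop("tools", [])) line above works around the exact Python pitfall named in its comment: an iterator cannot be JSON-serialized, so tools supplied as an iterator would fail once the request is serialized. A minimal reproduction of the error and the fix:

import json

tools = iter([{"type": "function", "function": {"name": "get_weather"}}])

try:
    json.dumps({"tools": tools})
except TypeError as err:
    print(err)  # Object of type list_iterator is not JSON serializable

# materializing the iterator first makes it serializable (and reusable)
tools = list(tools)
print(json.dumps({"tools": tools}))

As the hunks above show, the materialized list is then injected into the chat template kwargs only for the Qwen and DeepSeek tool-call families, and a non-streaming completion produced with tools goes through _post_process_completion instead of the plain chat-completion conversion.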
xinference/model/llm/transformers/chatglm.py CHANGED

@@ -23,7 +23,7 @@ import torch
 from ....core.scheduler import InferenceRequest
 from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
 from ..core import chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..utils import (
     GLM4_TOOL_CALL_FAMILY,
     generate_chat_completion,

@@ -40,9 +40,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,

@@ -50,8 +48,6 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
             peft_model=peft_model,

@@ -88,7 +84,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
xinference/model/llm/transformers/core.py CHANGED

@@ -38,7 +38,7 @@ from ....types import (
 )
 from ...utils import select_device
 from ..core import LLM, chat_context_var
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import (
     DEEPSEEK_TOOL_CALL_FAMILY,
     LLAMA3_TOOL_CALL_FAMILY,

@@ -92,14 +92,12 @@ class PytorchModel(LLM):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,
     ):
-        super().__init__(model_uid, model_family,
+        super().__init__(model_uid, model_family, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config

@@ -345,7 +343,7 @@ class PytorchModel(LLM):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
@@ -367,14 +365,26 @@ class PytorchModel(LLM):
         data = []
         for r in reqs:
             real_len = seq_length - r.padding_len
-            x = torch.cat(
-                [
-                    torch.full((r.padding_len,), 0, dtype=torch.long),
-                    torch.ones((real_len,), dtype=torch.long),
-                ]
-            )
-            data.append(x)
             r.extra_kwargs["attention_mask_seq_len"] = real_len
+
+            if self._tokenizer.padding_side == "left":
+                # [PAD][PAD]...[TOKEN]
+                x = torch.cat(
+                    [
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
+            else:  # right padding
+                # [TOKEN]...[PAD][PAD]
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((r.padding_len,), 0, dtype=torch.long),
+                    ]
+                )
+            data.append(x)
+
         return torch.stack(data).to(self._device)

     def build_decode_attention_mask(

@@ -388,14 +398,30 @@ class PytorchModel(LLM):
         data = []
         for r in reqs:
             r.extra_kwargs["attention_mask_seq_len"] += 1
-
-
-
-
-
-
-
-
+            if self._tokenizer.padding_side == "left":
+                attention_mask_seq_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = seq_length - attention_mask_seq_len
+                assert pad_len > 0, (
+                    f"pad_len must be greater than 0, got {pad_len} = "
+                    f"seq_length({seq_length}) - attention_mask_seq_len({attention_mask_seq_len})"
+                )
+                x = torch.cat(
+                    [
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                        torch.ones((attention_mask_seq_len,), dtype=torch.long),
+                    ]
+                )
+            else:
+                max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs)
+                real_len = r.extra_kwargs["attention_mask_seq_len"]
+                pad_len = max_len - real_len
+
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
         return torch.stack(data).to(self._device)

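Note: both mask builders now branch on the tokenizer's padding_side instead of assuming left padding. A self-contained sketch of the two layouts for a single request with three real tokens and two pads, independent of the xinference request plumbing:

import torch

pad_len, real_len = 2, 3

# left padding: [PAD][PAD][TOKEN][TOKEN][TOKEN]
left = torch.cat(
    [
        torch.full((pad_len,), 0, dtype=torch.long),
        torch.ones((real_len,), dtype=torch.long),
    ]
)

# right padding: [TOKEN][TOKEN][TOKEN][PAD][PAD]
right = torch.cat(
    [
        torch.ones((real_len,), dtype=torch.long),
        torch.full((pad_len,), 0, dtype=torch.long),
    ]
)

print(left.tolist())   # [0, 0, 1, 1, 1]
print(right.tolist())  # [1, 1, 1, 0, 0]

Note that in the decode hunk the right-padding branch pads each row to the batch maximum of attention_mask_seq_len, not to seq_length as the left-padding branch does.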
@@ -668,9 +694,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
     def __init__(
         self,
         model_uid: str,
-        model_family: "
-        model_spec: "LLMSpecV1",
-        quantization: str,
+        model_family: "LLMFamilyV2",
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
         peft_model: Optional[List[LoRA]] = None,

@@ -678,8 +702,6 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         super().__init__(
             model_uid,
             model_family,
-            model_spec,
-            quantization,
             model_path,
             pytorch_model_config,
             peft_model,

@@ -702,7 +724,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
xinference/model/llm/transformers/deepseek_v2.py CHANGED

@@ -15,7 +15,7 @@ import logging

 import torch

-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model

 logger = logging.getLogger(__name__)

@@ -61,7 +61,7 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):

     @classmethod
     def match_json(
-        cls, llm_family: "
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if llm_spec.model_format != "pytorch":
             return False
xinference/model/llm/transformers/gemma3.py CHANGED

@@ -15,7 +15,7 @@ import logging
 from typing import Dict, List, Set

 from ....core.scheduler import InferenceRequest
-from ..llm_family import
+from ..llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from .core import PytorchChatModel, register_non_default_model

 logger = logging.getLogger(__name__)

@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 class Gemma3TextChatModel(PytorchChatModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
xinference/model/llm/transformers/multimodal/cogagent.py CHANGED

@@ -21,7 +21,7 @@ import torch

 from .....model.utils import select_device
 from ...core import chat_context_var
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ...utils import _decode_image, parse_messages
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -47,7 +47,7 @@ class CogAgentChatModel(PytorchMultiModalModel):

     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         family = model_family.model_family or model_family.model_name
         if "cogagent" in family.lower():
xinference/model/llm/transformers/multimodal/deepseek_vl2.py CHANGED

@@ -23,7 +23,7 @@ import requests
 import torch

 from .....model.utils import select_device
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -39,7 +39,7 @@ class DeepSeekVL2ChatModel(PytorchMultiModalModel):

     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         llm_family = model_family.model_family or model_family.model_name
         if "deepseek-vl2" == llm_family.lower():
xinference/model/llm/transformers/multimodal/gemma3.py CHANGED

@@ -17,7 +17,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple

 from .....model.utils import select_device
 from .....types import PytorchModelConfig
-from ...llm_family import
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
 from ..core import register_non_default_model
 from .core import PytorchMultiModalModel

@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
 class Gemma3ChatModel(PytorchMultiModalModel):
     @classmethod
     def match_json(
-        cls, model_family: "
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
         if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
|