sglang 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +55 -2
- sglang/api.py +3 -5
- sglang/backend/anthropic.py +33 -13
- sglang/backend/openai.py +2 -1
- sglang/backend/runtime_endpoint.py +18 -5
- sglang/backend/vertexai.py +1 -0
- sglang/global_config.py +1 -0
- sglang/lang/chat_template.py +74 -0
- sglang/lang/interpreter.py +40 -16
- sglang/lang/ir.py +1 -1
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +2 -1
- sglang/srt/constrained/fsm_cache.py +15 -3
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +2 -2
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +1 -0
- sglang/srt/layers/logits_processor.py +114 -54
- sglang/srt/layers/radix_attention.py +2 -1
- sglang/srt/layers/token_attention.py +1 -0
- sglang/srt/managers/detokenizer_manager.py +5 -1
- sglang/srt/managers/io_struct.py +12 -0
- sglang/srt/managers/router/infer_batch.py +70 -33
- sglang/srt/managers/router/manager.py +7 -2
- sglang/srt/managers/router/model_rpc.py +116 -73
- sglang/srt/managers/router/model_runner.py +121 -155
- sglang/srt/managers/router/radix_cache.py +46 -38
- sglang/srt/managers/tokenizer_manager.py +56 -11
- sglang/srt/memory_pool.py +5 -14
- sglang/srt/model_config.py +7 -0
- sglang/srt/models/commandr.py +376 -0
- sglang/srt/models/dbrx.py +413 -0
- sglang/srt/models/dbrx_config.py +281 -0
- sglang/srt/models/gemma.py +22 -20
- sglang/srt/models/llama2.py +23 -21
- sglang/srt/models/llava.py +12 -10
- sglang/srt/models/mixtral.py +27 -25
- sglang/srt/models/qwen.py +23 -21
- sglang/srt/models/qwen2.py +23 -21
- sglang/srt/models/stablelm.py +292 -0
- sglang/srt/models/yivl.py +6 -5
- sglang/srt/openai_api_adapter.py +356 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +68 -439
- sglang/srt/server_args.py +76 -49
- sglang/srt/utils.py +88 -32
- sglang/srt/weight_utils.py +402 -0
- sglang/test/test_programs.py +8 -7
- sglang/test/test_utils.py +196 -8
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/METADATA +13 -15
- sglang-0.1.15.dist-info/RECORD +69 -0
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/WHEEL +1 -1
- sglang-0.1.13.dist-info/RECORD +0 -63
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2.py
CHANGED
@@ -1,34 +1,36 @@
 # Adapted from llama2.py
 # Modify details for the adaptation of Qwen2 model.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Tuple
 
 import torch
-from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.router.model_runner import InputMetadata
 from torch import nn
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
-    LinearMethodBase,
     MergedColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.
+from vllm.distributed import (
     get_tensor_model_parallel_world_size,
 )
-from
+from sglang.srt.weight_utils import (
     default_weight_loader,
     hf_model_weights_iterator,
 )
 
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.managers.router.model_runner import InputMetadata
+
 Qwen2Config = None
 
 
@@ -38,17 +40,17 @@ class Qwen2MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-
+        quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
             hidden_size,
             [intermediate_size] * 2,
             bias=False,
-
+            quant_config=quant_config,
         )
         self.down_proj = RowParallelLinear(
-            intermediate_size, hidden_size, bias=False,
+            intermediate_size, hidden_size, bias=False, quant_config=quant_config,
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -74,7 +76,7 @@ class Qwen2Attention(nn.Module):
         rope_theta: float = 1000000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 32768,
-
+        quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -105,13 +107,13 @@
             self.total_num_heads,
             self.total_num_kv_heads,
             bias=True,
-
+            quant_config=quant_config,
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
-
+            quant_config=quant_config,
         )
 
         self.rotary_emb = get_rope(
@@ -148,7 +150,7 @@ class Qwen2DecoderLayer(nn.Module):
         self,
         config: Qwen2Config,
         layer_id: int = 0,
-
+        quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -163,13 +165,13 @@
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
-
+            quant_config=quant_config,
         )
         self.mlp = Qwen2MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
-
+            quant_config=quant_config,
         )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -205,7 +207,7 @@ class Qwen2Model(nn.Module):
     def __init__(
         self,
         config: Qwen2Config,
-
+        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
         super().__init__()
         self.config = config
@@ -217,7 +219,7 @@
         )
         self.layers = nn.ModuleList(
             [
-                Qwen2DecoderLayer(config, i,
+                Qwen2DecoderLayer(config, i, quant_config=quant_config)
                 for i in range(config.num_hidden_layers)
             ]
         )
@@ -251,12 +253,12 @@ class Qwen2ForCausalLM(nn.Module):
     def __init__(
         self,
         config: Qwen2Config,
-
+        quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
-        self.
-        self.model = Qwen2Model(config,
+        self.quant_config = quant_config
+        self.model = Qwen2Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
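Note: the recurring change in qwen2.py (and in the other model files in this release) is that every constructor gains an optional quant_config and forwards it to its parallel linear layers, replacing the LinearMethodBase plumbing removed at the top of the hunk. A minimal, self-contained sketch of that constructor-threading pattern, using hypothetical stand-in classes rather than the real vllm layers:

from typing import Optional


class QuantizationConfig:
    # Stand-in for vllm's QuantizationConfig (illustration only).
    name = "gptq"


class FakeParallelLinear:
    # Stand-in for MergedColumnParallelLinear / RowParallelLinear.
    def __init__(self, in_dim: int, out_dim: int,
                 quant_config: Optional[QuantizationConfig] = None):
        # The leaf layer is the only place the config is interpreted.
        self.quant_config = quant_config


class FakeMLP:
    def __init__(self, hidden: int, inter: int,
                 quant_config: Optional[QuantizationConfig] = None):
        # Intermediate modules do not inspect the config; they only forward it.
        self.gate_up_proj = FakeParallelLinear(hidden, inter, quant_config=quant_config)
        self.down_proj = FakeParallelLinear(inter, hidden, quant_config=quant_config)


class FakeModel:
    def __init__(self, quant_config: Optional[QuantizationConfig] = None):
        self.quant_config = quant_config
        self.mlp = FakeMLP(1024, 2816, quant_config=quant_config)


m = FakeModel(QuantizationConfig())
assert m.mlp.down_proj.quant_config is m.quant_config  # config reached the leaf layer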
sglang/srt/models/stablelm.py
ADDED
@@ -0,0 +1,292 @@
+# This code is based on:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/stablelm.py
+"""Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights."""
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.weight_utils import (
+    default_weight_loader,
+    hf_model_weights_iterator,
+)
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.managers.router.model_runner import InputMetadata
+
+
+class StablelmMLP(nn.Module):
+    def __init__(
+        self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size, config.hidden_size, bias=False, quant_config=quant_config,
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_key_value_heads == 0
+        self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        rope_pct = getattr(
+            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+        )
+        self.rotary_ndims = int(self.head_dim * rope_pct)
+        self.scaling = self.head_dim**-0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+        self.qkv_bias = getattr(config, "use_qkv_bias", False)
+        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads "
+                f"(got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_key_value_heads,
+            self.qkv_bias,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_ndims,
+            max_position=self.config.max_position_embeddings,
+            base=self.config.rope_theta,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, input_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class StablelmDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.self_attn = StablelmAttention(config, layer_id=layer_id)
+        self.mlp = StablelmMLP(config, quant_config=quant_config)
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            input_metadata=input_metadata,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, residual
+
+
+class StableLMEpochModel(nn.Module):
+    def __init__(
+        self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                StablelmDecoderLayer(config, i, quant_config=quant_config)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                input_metadata,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class StableLmForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = StableLMEpochModel(config, quant_config=quant_config)
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def load_weights(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        load_format: str = "auto",
+        revision: Optional[str] = None,
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, load_format, revision
+        ):
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = StableLmForCausalLM
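Note: the load_weights method above folds the separate q_proj/k_proj/v_proj and gate_proj/up_proj tensors found in HuggingFace checkpoints into the fused qkv_proj and gate_up_proj parameters this model declares. A standalone sketch of just that name-remapping step, using hypothetical checkpoint tensor names:

# Same mapping as in StableLmForCausalLM.load_weights above.
stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]


def remap(checkpoint_name: str):
    """Return (model_param_name, shard_id); shard_id is None for 1:1 params."""
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in checkpoint_name:
            return checkpoint_name.replace(weight_name, param_name), shard_id
    return checkpoint_name, None


# Hypothetical checkpoint names, for illustration only.
print(remap("model.layers.0.self_attn.q_proj.weight"))  # maps to ...qkv_proj.weight, shard "q"
print(remap("model.layers.0.mlp.up_proj.weight"))       # maps to ...gate_up_proj.weight, shard 1
print(remap("model.layers.0.input_layernorm.weight"))   # unchanged, shard_id None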
sglang/srt/models/yivl.py
CHANGED
@@ -5,16 +5,17 @@ from typing import List, Optional
 
 import torch
 import torch.nn as nn
+from transformers import CLIPVisionModel, LlavaConfig
+from sglang.srt.weight_utils import (
+    default_weight_loader,
+    hf_model_weights_iterator,
+)
+
 from sglang.srt.models.llava import (
     LlavaLlamaForCausalLM,
     clip_vision_embed_forward,
     monkey_path_clip_vision_embed_forward,
 )
-from transformers import CLIPVisionModel, LlavaConfig
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
 
 
 class YiVLForCausalLM(LlavaLlamaForCausalLM):
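Note: as in qwen2.py and stablelm.py above, this file now imports default_weight_loader and hf_model_weights_iterator from the vendored sglang/srt/weight_utils.py (new in 0.1.15) instead of vllm.model_executor.weight_utils. A hedged sketch of how downstream code that must run against both the 0.1.13 and 0.1.15 layouts could resolve the helpers:

# Sketch only: pick whichever weight-utils location this environment provides.
try:
    # Vendored location introduced in sglang 0.1.15.
    from sglang.srt.weight_utils import (
        default_weight_loader,
        hf_model_weights_iterator,
    )
except ImportError:
    # Location used by the 0.1.13 code paths, which imported from vllm.
    from vllm.model_executor.weight_utils import (
        default_weight_loader,
        hf_model_weights_iterator,
    )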