sglang 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -8
- sglang/api.py +1 -1
- sglang/backend/runtime_endpoint.py +14 -4
- sglang/backend/vertexai.py +5 -4
- sglang/bench.py +627 -0
- sglang/bench_latency.py +22 -20
- sglang/bench_serving.py +758 -0
- sglang/check_env.py +171 -0
- sglang/global_config.py +3 -1
- sglang/lang/backend/__init__.py +0 -0
- sglang/lang/backend/anthropic.py +77 -0
- sglang/lang/backend/base_backend.py +80 -0
- sglang/lang/backend/litellm.py +90 -0
- sglang/lang/backend/openai.py +438 -0
- sglang/lang/backend/runtime_endpoint.py +283 -0
- sglang/lang/backend/vertexai.py +149 -0
- sglang/lang/chat_template.py +2 -2
- sglang/lang/ir.py +3 -3
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +1 -1
- sglang/launch_server_llavavid.py +1 -4
- sglang/srt/conversation.py +1 -1
- sglang/srt/layers/context_flashattention_nopad.py +0 -29
- sglang/srt/layers/extend_attention.py +0 -39
- sglang/srt/layers/linear.py +869 -0
- sglang/srt/layers/quantization/__init__.py +49 -0
- sglang/srt/layers/quantization/fp8.py +662 -0
- sglang/srt/layers/radix_attention.py +31 -5
- sglang/srt/layers/token_attention.py +1 -51
- sglang/srt/managers/controller/cuda_graph_runner.py +44 -18
- sglang/srt/managers/controller/infer_batch.py +76 -72
- sglang/srt/managers/controller/manager_multi.py +109 -98
- sglang/srt/managers/controller/manager_single.py +105 -50
- sglang/srt/managers/controller/model_runner.py +42 -18
- sglang/srt/managers/controller/radix_cache.py +4 -3
- sglang/srt/managers/controller/schedule_heuristic.py +4 -0
- sglang/srt/managers/controller/tp_worker.py +143 -156
- sglang/srt/managers/detokenizer_manager.py +49 -5
- sglang/srt/managers/io_struct.py +36 -17
- sglang/srt/managers/tokenizer_manager.py +228 -125
- sglang/srt/memory_pool.py +46 -58
- sglang/srt/model_loader/model_loader.py +277 -0
- sglang/srt/model_loader/utils.py +260 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +317 -0
- sglang/srt/models/llama2.py +65 -16
- sglang/srt/models/llama_classification.py +1 -0
- sglang/srt/models/llava.py +1 -0
- sglang/srt/models/llavavid.py +1 -0
- sglang/srt/models/minicpm.py +2 -8
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +130 -108
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/openai_api/adapter.py +432 -0
- sglang/srt/openai_api/api_adapter.py +432 -0
- sglang/srt/openai_api/openai_api_adapter.py +431 -0
- sglang/srt/openai_api/openai_protocol.py +207 -0
- sglang/srt/openai_api/protocol.py +208 -0
- sglang/srt/openai_protocol.py +17 -0
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +114 -90
- sglang/srt/server_args.py +27 -17
- sglang/srt/utils.py +17 -118
- sglang/test/test_conversation.py +1 -1
- sglang/test/test_openai_protocol.py +1 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +2 -2
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -159
- sglang-0.1.22.dist-info/RECORD +103 -0
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
- sglang-0.1.20.dist-info/RECORD +0 -82
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
- {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -8,24 +8,28 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
@@ -34,8 +38,8 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.controller.model_runner import InputMetadata
 
-class Qwen2MoeMLP(nn.Module):
 
+class Qwen2MoeMLP(nn.Module):
     def __init__(
         self,
         hidden_size: int,
@@ -46,17 +50,20 @@ class Qwen2MoeMLP(nn.Module):
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results)
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -67,7 +74,6 @@ class Qwen2MoeMLP(nn.Module):
 
 
 class Qwen2MoeSparseMoeBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -79,20 +85,22 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
-
-        self.experts = FusedMoE(num_experts=config.num_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=None)
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.num_experts, bias=False, quant_config=None
+        )
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -103,9 +111,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             )
         else:
             self.shared_expert = None
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
-                                                  1,
-                                                  bias=False)
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
@@ -114,24 +120,24 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.shared_expert is not None:
             shared_output = self.shared_expert(hidden_states)
             if self.shared_expert_gate is not None:
-                shared_output = F.sigmoid(
-                    self.shared_expert_gate(hidden_states)) * shared_output
+                shared_output = (
+                    F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_output
+                )
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 class Qwen2MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -190,17 +196,19 @@ class Qwen2MoeAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_dim,
-                                   self.scaling,
-                                   num_kv_heads=self.num_kv_heads,
-                                   layer_id=layer_id)
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        input_metadata: InputMetadata
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -211,7 +219,6 @@ class Qwen2MoeAttention(nn.Module):
 
 
 class Qwen2MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -223,8 +230,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -239,13 +245,13 @@ class Qwen2MoeDecoderLayer(nn.Module):
 
         # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
        # `mlp_only_layers` in the config.
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (layer_id not in mlp_only_layers) and (
-                config.num_experts > 0
-                and (layer_id + 1) % config.decoder_sparse_step == 0):
-            self.mlp = Qwen2MoeSparseMoeBlock(config=config,
-                                              quant_config=quant_config)
+            config.num_experts > 0 and (layer_id + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen2MoeSparseMoeBlock(config=config, quant_config=quant_config)
         else:
             self.mlp = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -253,10 +259,10 @@ class Qwen2MoeDecoderLayer(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -270,23 +276,20 @@ class Qwen2MoeDecoderLayer(nn.Module):
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
-            input_metadata=input_metadata
+            input_metadata=input_metadata,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 class Qwen2MoeModel(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -301,13 +304,14 @@ class Qwen2MoeModel(nn.Module):
             config.vocab_size,
             config.hidden_size,
         )
-        self.layers = nn.ModuleList([
-            Qwen2MoeDecoderLayer(config,
-                                 layer_id,
-                                 cache_config,
-                                 quant_config=quant_config)
-            for layer_id in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Qwen2MoeDecoderLayer(
+                    config, layer_id, cache_config, quant_config=quant_config
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
@@ -315,7 +319,7 @@ class Qwen2MoeModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         if input_embeds is None:
             hidden_states = self.embed_tokens(input_ids)
@@ -324,10 +328,9 @@ class Qwen2MoeModel(nn.Module):
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
-            hidden_states, residual = layer(positions,
-                                            hidden_states,
-                                            input_metadata,
-                                            residual)
+            hidden_states, residual = layer(
+                positions, hidden_states, input_metadata, residual
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -346,28 +349,34 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen2MoeModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()
 
+    @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def compute_logits(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, input_metadata,
-                                   input_embeds)
-        return self.logits_processor(input_ids, hidden_states,
-                                     self.lm_head.weight, input_metadata)
-
-    def compute_logits(self, input_ids: torch.Tensor, hidden_states: torch.Tensor,
-                       input_metadata: InputMetadata) -> torch.Tensor:
-        logits = self.logits_processor(input_ids, hidden_states, self.lm_head.weight,
-                                       input_metadata)
+        logits = self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
         return logits
 
     def sample(
@@ -391,18 +400,27 @@ class Qwen2MoeForCausalLM(nn.Module):
         expert_params_mapping = [
             # These are the weights for the experts
             # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
-             else "experts.w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
-            for expert_id in range(self.config.num_experts) for shard_id,
-            weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
+            (
+                (
+                    "experts.w13_weight"
+                    if weight_name in ["gate_proj", "up_proj"]
+                    else "experts.w2_weight"
+                ),
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(self.config.num_experts)
+            for shard_id, weight_name in enumerate(
+                ["gate_proj", "down_proj", "up_proj"]
+            )
         ]
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -433,11 +451,13 @@ class Qwen2MoeForCausalLM(nn.Module):
                     name = name.replace(weight_name, param_name)
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  weight_name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        weight_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -447,8 +467,10 @@ class Qwen2MoeForCausalLM(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
 
+
 EntryClass = Qwen2MoeForCausalLM
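Most of this file's diff is mechanical reformatting, but the reworked load_weights section above also shows how per-expert checkpoint tensors are mapped onto vllm's fused FusedMoE parameters. The sketch below simply re-evaluates that expert_params_mapping comprehension outside the class to show the tuples it yields; num_experts is a made-up value for illustration, not taken from this package.

# Illustrative only: mirrors the expert_params_mapping comprehension in the
# diff above. The real expert count comes from the model's HuggingFace config.
num_experts = 4  # hypothetical value for the example

expert_params_mapping = [
    # (param_name, weight_name, expert_id, shard_id)
    (
        (
            "experts.w13_weight"
            if weight_name in ["gate_proj", "up_proj"]
            else "experts.w2_weight"
        ),
        f"experts.{expert_id}.{weight_name}.weight",
        expert_id,
        shard_id,
    )
    for expert_id in range(num_experts)
    for shard_id, weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
]

for entry in expert_params_mapping[:3]:
    print(entry)
# ('experts.w13_weight', 'experts.0.gate_proj.weight', 0, 0)
# ('experts.w2_weight', 'experts.0.down_proj.weight', 0, 1)
# ('experts.w13_weight', 'experts.0.up_proj.weight', 0, 2)

Gate and up projections land in the fused w13_weight while down projections land in w2_weight; the expert_id and shard_id passed to weight_loader tell the fused parameter which slice each checkpoint tensor belongs to.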
sglang/srt/models/stablelm.py
CHANGED