sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/models/grok.py
CHANGED
@@ -78,6 +78,7 @@ class Grok1MoE(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
+        layer_id: int,
         num_experts: int,
         top_k: int,
         hidden_size: int,
@@ -128,6 +129,7 @@ class Grok1MoE(nn.Module):
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
@@ -331,6 +333,7 @@ class Grok1DecoderLayer(nn.Module):
         )
         self.block_sparse_moe = Grok1MoE(
             config=config,
+            layer_id=layer_id,
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
sglang/srt/models/hunyuan.py
CHANGED
sglang/srt/models/llama4.py
CHANGED
@@ -87,6 +87,7 @@ class Llama4MoE(nn.Module):
     def __init__(
         self,
         config: Llama4TextConfig,
+        layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -114,6 +115,7 @@ class Llama4MoE(nn.Module):
             num_experts=config.num_local_experts,
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size_moe,
+            layer_id=layer_id,
             reduce_results=False,
             quant_config=quant_config,
             apply_router_weight_on_input=True,
@@ -373,6 +375,7 @@ class Llama4DecoderLayer(nn.Module):
         if is_moe_layer:
             self.feed_forward = Llama4MoE(
                 config=config,
+                layer_id=layer_id,
                 quant_config=quant_config,
                 prefix=add_prefix("feed_forward", prefix),
             )
sglang/srt/models/mixtral.py
CHANGED
@@ -69,6 +69,7 @@ class MixtralMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -97,6 +98,7 @@ class MixtralMoE(nn.Module):
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
@@ -226,6 +228,7 @@ class MixtralDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("block_sparse_moe", prefix),
         )
sglang/srt/models/olmoe.py
CHANGED
@@ -63,6 +63,7 @@ class OlmoeMoE(nn.Module):
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
+        layer_id: int = 0,
         prefix: str = "",
     ):
         super().__init__()
@@ -89,6 +90,7 @@ class OlmoeMoE(nn.Module):
             reduce_results=True,
             quant_config=quant_config,
             tp_size=tp_size,
+            layer_id=layer_id,
             prefix=add_prefix("experts", prefix),
         )

@@ -224,6 +226,7 @@ class OlmoeDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("mlp", prefix),
         )
sglang/srt/models/phimoe.py
CHANGED
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -144,19 +144,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         )
         self.top_k = config.num_experts_per_tok

-        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
-            group=parallel_state.get_tp_group().device_group,
-            router_topk=self.top_k,
-            permute_fusion=True,
-            num_experts=self.num_experts,
-            num_local_experts=config.num_experts // self.tp_size,
-            hidden_size=config.hidden_size,
-            params_dtype=config.torch_dtype,
-            deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
-            async_finish=True,  # TODO
-            return_recv_hook=True,
-        )
-
     def forward(
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:
@@ -207,41 +194,12 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         topk_weights = torch.empty(
             (0, self.top_k), dtype=torch.float32, device=hidden_states.device
         )
-        if self.ep_size > 1:
-            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-            (
-                hidden_states,
-                topk_idx,
-                topk_weights,
-                reorder_topk_ids,
-                num_recv_tokens_per_expert,
-                seg_indptr,
-                masked_m,
-                expected_m,
-            ) = self.deepep_dispatcher.dispatch(
-                hidden_states=hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             topk_idx=topk_idx,
             topk_weights=topk_weights,
-            reorder_topk_ids=reorder_topk_ids,
-            seg_indptr=seg_indptr,
-            masked_m=masked_m,
-            expected_m=expected_m,
-            num_recv_tokens_per_expert=num_recv_tokens_per_expert,
             forward_batch=forward_batch,
         )
-        if self.ep_size > 1:
-            final_hidden_states = self.deepep_dispatcher.combine(
-                hidden_states=final_hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         return final_hidden_states

     def op_gate(self, state):
@@ -278,8 +236,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):

     def op_dispatch_a(self, state):
         if self.ep_size > 1:
-
-            self.deepep_dispatcher.dispatch_a(
+            self.experts.deepep_dispatcher.dispatch_a(
                 hidden_states=state.pop("hidden_states_mlp_input"),
                 topk_idx=state.pop("topk_idx_local"),
                 topk_weights=state.pop("topk_weights_local"),
@@ -292,46 +249,32 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         with get_global_expert_distribution_recorder().with_current_layer(
             self.layer_id
         ):
-            (
-                state.hidden_states_experts_input,
-                state.topk_idx_dispatched,
-                state.topk_weights_dispatched,
-                state.reorder_topk_ids,
-                state.num_recv_tokens_per_expert,
-                state.seg_indptr,
-                state.masked_m,
-                state.expected_m,
-            ) = self.deepep_dispatcher.dispatch_b(
+            state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
            )

     def op_experts(self, state):
-        state.hidden_states_experts_output = self.experts(
-            hidden_states=state.pop("hidden_states_experts_input"),
-            topk_idx=state.topk_idx_dispatched,
-            topk_weights=state.topk_weights_dispatched,
-            reorder_topk_ids=state.pop("reorder_topk_ids"),
-            seg_indptr=state.pop("seg_indptr"),
-            masked_m=state.pop("masked_m"),
-            expected_m=state.pop("expected_m"),
-            num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-            forward_batch=state.forward_batch,
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
         )

     def op_combine_a(self, state):
         if self.ep_size > 1:
-            self.deepep_dispatcher.combine_a(
+            self.experts.deepep_dispatcher.combine_a(
                 hidden_states=state.pop("hidden_states_experts_output"),
-                topk_idx=state.topk_idx_dispatched,
-                topk_weights=state.topk_weights_dispatched,
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
                 forward_batch=state.forward_batch,
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
             )
+            state.pop("dispatch_output")

     def op_combine_b(self, state):
         if self.ep_size > 1:
-            state.hidden_states_after_combine = self.deepep_dispatcher.combine_b(
-                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
             )

     def op_output(self, state):