sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/models/grok.py CHANGED
@@ -78,6 +78,7 @@ class Grok1MoE(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
+        layer_id: int,
         num_experts: int,
         top_k: int,
         hidden_size: int,
@@ -128,6 +129,7 @@ class Grok1MoE(nn.Module):
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
@@ -331,6 +333,7 @@ class Grok1DecoderLayer(nn.Module):
         )
         self.block_sparse_moe = Grok1MoE(
             config=config,
+            layer_id=layer_id,
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
sglang/srt/models/hunyuan.py CHANGED
@@ -163,6 +163,7 @@ class HunYuanSparseMoeBlock(nn.Module):
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=False,
+            layer_id=layer_id,
             quant_config=quant_config,
         )

sglang/srt/models/llama4.py CHANGED
@@ -87,6 +87,7 @@ class Llama4MoE(nn.Module):
     def __init__(
         self,
         config: Llama4TextConfig,
+        layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -114,6 +115,7 @@ class Llama4MoE(nn.Module):
             num_experts=config.num_local_experts,
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size_moe,
+            layer_id=layer_id,
             reduce_results=False,
             quant_config=quant_config,
             apply_router_weight_on_input=True,
@@ -373,6 +375,7 @@ class Llama4DecoderLayer(nn.Module):
         if is_moe_layer:
             self.feed_forward = Llama4MoE(
                 config=config,
+                layer_id=layer_id,
                 quant_config=quant_config,
                 prefix=add_prefix("feed_forward", prefix),
             )
sglang/srt/models/mixtral.py CHANGED
@@ -69,6 +69,7 @@ class MixtralMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -97,6 +98,7 @@ class MixtralMoE(nn.Module):
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
@@ -226,6 +228,7 @@ class MixtralDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("block_sparse_moe", prefix),
         )
sglang/srt/models/olmoe.py CHANGED
@@ -63,6 +63,7 @@ class OlmoeMoE(nn.Module):
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
+        layer_id: int = 0,
         prefix: str = "",
     ):
         super().__init__()
@@ -89,6 +90,7 @@ class OlmoeMoE(nn.Module):
             reduce_results=True,
             quant_config=quant_config,
             tp_size=tp_size,
+            layer_id=layer_id,
             prefix=add_prefix("experts", prefix),
         )

@@ -224,6 +226,7 @@ class OlmoeDecoderLayer(nn.Module):
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
             quant_config=quant_config,
             prefix=add_prefix("mlp", prefix),
         )
sglang/srt/models/phimoe.py CHANGED
@@ -210,6 +210,7 @@ class PhiMoE(nn.Module):
         self.experts = FusedMoE(
             num_experts=num_experts,
             top_k=top_k,
+            layer_id=layer_id,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=True,
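
The grok.py, hunyuan.py, llama4.py, mixtral.py, olmoe.py, and phimoe.py diffs above all make the same mechanical change: each decoder layer now passes its own layer index down to its MoE block, which forwards it to the fused MoE implementation as `layer_id`. Given the `eplb/expert_distribution.py` and `eplb/expert_location.py` changes elsewhere in this release, a plausible purpose is attributing per-layer expert routing statistics, though the diff itself does not state this. Below is a minimal sketch of the plumbing pattern only; `TinyMoE` and `TinyDecoderLayer` are hypothetical stand-ins, not sglang classes, and the real `FusedMoE`/`EPMoE` constructors take many more arguments.

```python
# Hypothetical sketch of the layer_id plumbing seen in the model diffs above.
import torch
from torch import nn


class TinyMoE(nn.Module):
    def __init__(self, num_experts: int, hidden_size: int, layer_id: int = 0):
        super().__init__()
        # The new keyword threads the decoder-layer index into the MoE wrapper,
        # e.g. so per-layer expert statistics can be attributed (assumption).
        self.layer_id = layer_id
        self.experts = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(num_experts)]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Trivial routing: average all experts (placeholder for real top-k routing).
        return torch.stack([expert(x) for expert in self.experts]).mean(dim=0)


class TinyDecoderLayer(nn.Module):
    def __init__(self, hidden_size: int, num_experts: int, layer_id: int):
        super().__init__()
        # Same shape as the diffs: the layer forwards its index to the MoE block
        # instead of the block silently defaulting to 0.
        self.block_sparse_moe = TinyMoE(
            num_experts=num_experts,
            hidden_size=hidden_size,
            layer_id=layer_id,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.block_sparse_moe(x)


if __name__ == "__main__":
    layer = TinyDecoderLayer(hidden_size=16, num_experts=4, layer_id=3)
    out = layer(torch.randn(2, 16))
    print(out.shape, layer.block_sparse_moe.layer_id)  # torch.Size([2, 16]) 3
```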
sglang/srt/models/qwen3_moe.py CHANGED
@@ -144,19 +144,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         )
         self.top_k = config.num_experts_per_tok

-        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
-            group=parallel_state.get_tp_group().device_group,
-            router_topk=self.top_k,
-            permute_fusion=True,
-            num_experts=self.num_experts,
-            num_local_experts=config.num_experts // self.tp_size,
-            hidden_size=config.hidden_size,
-            params_dtype=config.torch_dtype,
-            deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
-            async_finish=True,  # TODO
-            return_recv_hook=True,
-        )
-
     def forward(
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:
@@ -207,41 +194,12 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             topk_weights = torch.empty(
                 (0, self.top_k), dtype=torch.float32, device=hidden_states.device
             )
-        if self.ep_size > 1:
-            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-            (
-                hidden_states,
-                topk_idx,
-                topk_weights,
-                reorder_topk_ids,
-                num_recv_tokens_per_expert,
-                seg_indptr,
-                masked_m,
-                expected_m,
-            ) = self.deepep_dispatcher.dispatch(
-                hidden_states=hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
             topk_idx=topk_idx,
             topk_weights=topk_weights,
-            reorder_topk_ids=reorder_topk_ids,
-            seg_indptr=seg_indptr,
-            masked_m=masked_m,
-            expected_m=expected_m,
-            num_recv_tokens_per_expert=num_recv_tokens_per_expert,
             forward_batch=forward_batch,
         )
-        if self.ep_size > 1:
-            final_hidden_states = self.deepep_dispatcher.combine(
-                hidden_states=final_hidden_states,
-                topk_idx=topk_idx,
-                topk_weights=topk_weights,
-                forward_batch=forward_batch,
-            )
         return final_hidden_states

     def op_gate(self, state):
@@ -278,8 +236,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):

     def op_dispatch_a(self, state):
         if self.ep_size > 1:
-            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
-            self.deepep_dispatcher.dispatch_a(
+            self.experts.deepep_dispatcher.dispatch_a(
                 hidden_states=state.pop("hidden_states_mlp_input"),
                 topk_idx=state.pop("topk_idx_local"),
                 topk_weights=state.pop("topk_weights_local"),
@@ -292,46 +249,32 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             with get_global_expert_distribution_recorder().with_current_layer(
                 self.layer_id
             ):
-                (
-                    state.hidden_states_experts_input,
-                    state.topk_idx_dispatched,
-                    state.topk_weights_dispatched,
-                    state.reorder_topk_ids,
-                    state.num_recv_tokens_per_expert,
-                    state.seg_indptr,
-                    state.masked_m,
-                    state.expected_m,
-                ) = self.deepep_dispatcher.dispatch_b(
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
                     tbo_subbatch_index=state.get("tbo_subbatch_index"),
                 )

     def op_experts(self, state):
-        state.hidden_states_experts_output = self.experts(
-            hidden_states=state.pop("hidden_states_experts_input"),
-            topk_idx=state.topk_idx_dispatched,
-            topk_weights=state.topk_weights_dispatched,
-            reorder_topk_ids=state.pop("reorder_topk_ids"),
-            seg_indptr=state.pop("seg_indptr"),
-            masked_m=state.pop("masked_m"),
-            expected_m=state.pop("expected_m"),
-            num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
-            forward_batch=state.forward_batch,
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
         )

     def op_combine_a(self, state):
         if self.ep_size > 1:
-            self.deepep_dispatcher.combine_a(
+            self.experts.deepep_dispatcher.combine_a(
                 hidden_states=state.pop("hidden_states_experts_output"),
-                topk_idx=state.pop("topk_idx_dispatched"),
-                topk_weights=state.pop("topk_weights_dispatched"),
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
                 forward_batch=state.forward_batch,
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
             )
+            state.pop("dispatch_output")

     def op_combine_b(self, state):
         if self.ep_size > 1:
-            state.hidden_states_after_combine = self.deepep_dispatcher.combine_b(
-                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
             )

     def op_output(self, state):
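
The qwen3_moe.py hunks above remove the block-local MaybeTboDeepEPDispatcher: dispatch and combine are now reached through `self.experts.deepep_dispatcher`, `dispatch_b` returns a single `dispatch_output` object instead of an eight-element tuple of loose tensors, and the expert computation consumes that object via `self.experts.moe_impl(dispatch_output=...)`. This matches the new `sglang/srt/layers/moe/token_dispatcher/` package in the file list. The sketch below illustrates only the shape of that refactor; `DispatchOutput`, `ToyDispatcher`, and `ToyExperts` are illustrative names and simplified signatures, not sglang's actual classes.

```python
# Hypothetical sketch: bundle dispatcher results into one object owned by the
# experts module, instead of threading many loose tensors through the state.
from dataclasses import dataclass

import torch


@dataclass
class DispatchOutput:
    hidden_states: torch.Tensor
    topk_idx: torch.Tensor
    topk_weights: torch.Tensor
    # The real object would also carry EP metadata such as reorder_topk_ids,
    # seg_indptr, masked_m, expected_m (assumption based on the removed kwargs).


class ToyDispatcher:
    def dispatch_b(self, hidden_states, topk_idx, topk_weights) -> DispatchOutput:
        # A real dispatcher would shuffle tokens to their experts here.
        return DispatchOutput(hidden_states, topk_idx, topk_weights)

    def combine_a(self, hidden_states, topk_idx, topk_weights) -> torch.Tensor:
        # A real dispatcher would launch the all-to-all combine here.
        return hidden_states


class ToyExperts:
    """Owns its dispatcher, mirroring `self.experts.deepep_dispatcher`."""

    def __init__(self):
        self.deepep_dispatcher = ToyDispatcher()

    def moe_impl(self, dispatch_output: DispatchOutput) -> torch.Tensor:
        # Expert MLPs would run on the dispatched tokens; identity here.
        return dispatch_output.hidden_states


experts = ToyExperts()
hidden = torch.randn(4, 8)
topk_idx = torch.zeros(4, 2, dtype=torch.long)
topk_weights = torch.full((4, 2), 0.5)

dispatch_output = experts.deepep_dispatcher.dispatch_b(hidden, topk_idx, topk_weights)
out = experts.moe_impl(dispatch_output=dispatch_output)
out = experts.deepep_dispatcher.combine_a(
    out, dispatch_output.topk_idx, dispatch_output.topk_weights
)
print(out.shape)  # torch.Size([4, 8])
```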