sglang 0.4.10__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sglang/bench_offline_throughput.py +20 -0
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +1 -0
  5. sglang/srt/conversation.py +0 -112
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
  7. sglang/srt/disaggregation/launch_lb.py +5 -20
  8. sglang/srt/disaggregation/mooncake/conn.py +33 -15
  9. sglang/srt/disaggregation/prefill.py +1 -0
  10. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  11. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  12. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  13. sglang/srt/distributed/parallel_state.py +11 -0
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/http_server.py +35 -15
  16. sglang/srt/eplb/expert_distribution.py +4 -2
  17. sglang/srt/hf_transformers_utils.py +25 -10
  18. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  19. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  20. sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
  21. sglang/srt/layers/attention/utils.py +6 -1
  22. sglang/srt/layers/attention/vision.py +27 -10
  23. sglang/srt/layers/communicator.py +14 -4
  24. sglang/srt/layers/linear.py +7 -1
  25. sglang/srt/layers/logits_processor.py +9 -1
  26. sglang/srt/layers/moe/ep_moe/layer.py +29 -68
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
  29. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
  30. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  31. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  32. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  33. sglang/srt/layers/moe/utils.py +43 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  35. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  36. sglang/srt/layers/quantization/fp8.py +57 -1
  37. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  38. sglang/srt/layers/quantization/w8a8_int8.py +4 -1
  39. sglang/srt/layers/vocab_parallel_embedding.py +7 -1
  40. sglang/srt/lora/lora_registry.py +7 -0
  41. sglang/srt/managers/cache_controller.py +43 -39
  42. sglang/srt/managers/data_parallel_controller.py +52 -2
  43. sglang/srt/managers/io_struct.py +6 -1
  44. sglang/srt/managers/schedule_batch.py +3 -2
  45. sglang/srt/managers/schedule_policy.py +3 -1
  46. sglang/srt/managers/scheduler.py +145 -6
  47. sglang/srt/managers/template_manager.py +25 -22
  48. sglang/srt/managers/tokenizer_manager.py +114 -62
  49. sglang/srt/managers/utils.py +45 -1
  50. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  51. sglang/srt/mem_cache/hicache_storage.py +13 -12
  52. sglang/srt/mem_cache/hiradix_cache.py +21 -4
  53. sglang/srt/mem_cache/memory_pool.py +15 -118
  54. sglang/srt/mem_cache/memory_pool_host.py +350 -33
  55. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  56. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
  57. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  58. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
  59. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
  60. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
  61. sglang/srt/model_executor/cuda_graph_runner.py +42 -4
  62. sglang/srt/model_executor/forward_batch_info.py +13 -3
  63. sglang/srt/model_executor/model_runner.py +13 -1
  64. sglang/srt/model_loader/weight_utils.py +2 -0
  65. sglang/srt/models/deepseek_v2.py +28 -23
  66. sglang/srt/models/glm4_moe.py +85 -22
  67. sglang/srt/models/grok.py +3 -3
  68. sglang/srt/models/llama4.py +13 -2
  69. sglang/srt/models/mixtral.py +3 -3
  70. sglang/srt/models/mllama4.py +428 -19
  71. sglang/srt/models/qwen2_moe.py +1 -4
  72. sglang/srt/models/qwen3_moe.py +7 -8
  73. sglang/srt/models/step3_vl.py +1 -4
  74. sglang/srt/multimodal/processors/base_processor.py +4 -3
  75. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  76. sglang/srt/operations_strategy.py +1 -1
  77. sglang/srt/server_args.py +115 -21
  78. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  79. sglang/srt/two_batch_overlap.py +6 -4
  80. sglang/srt/utils.py +4 -24
  81. sglang/srt/weight_sync/utils.py +1 -1
  82. sglang/test/attention/test_trtllm_mla_backend.py +945 -0
  83. sglang/test/runners.py +2 -2
  84. sglang/test/test_utils.py +3 -3
  85. sglang/version.py +1 -1
  86. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
  87. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +92 -81
  88. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  89. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  90. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
  91. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
  92. {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/glm4_moe.py CHANGED
@@ -23,6 +23,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     parallel_state,
@@ -50,9 +51,8 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import (
-    DeepEPMoE,
     get_moe_impl_class,
-    use_flashinfer_trtllm_moe,
+    should_use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -83,7 +83,6 @@ from sglang.srt.two_batch_overlap import (
 )
 from sglang.srt.utils import (
     BumpAllocator,
-    DeepEPMode,
     LazyValue,
     add_prefix,
     bind_or_assign,
@@ -388,6 +387,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
     ):
         nn.Module.__init__(self)
         self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_size = get_moe_expert_parallel_world_size()
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
         self.num_fused_shared_experts = (
@@ -426,7 +426,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 correction_bias=self.gate.e_score_correction_bias,
                 routed_scaling_factor=self.routed_scaling_factor,
             )
-            if not use_flashinfer_trtllm_moe
+            if not should_use_flashinfer_trtllm_moe()
            else None
        )
 
@@ -443,15 +443,14 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             routed_scaling_factor=self.routed_scaling_factor,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -465,7 +464,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                     topk_group=config.topk_group,
                     correction_bias=self.gate.e_score_correction_bias,
                 )
-                if use_flashinfer_trtllm_moe
+                if should_use_flashinfer_trtllm_moe()
                 else {}
             ),
         )
@@ -482,11 +481,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 quant_config=quant_config,
                 reduce_results=False,
                 prefix=add_prefix("shared_experts", prefix),
-                **(
-                    dict(tp_rank=0, tp_size=1)
-                    if global_server_args_dict["enable_deepep_moe"]
-                    else {}
-                ),
+                **(dict(tp_rank=0, tp_size=1) if self.ep_size > 1 else {}),
             )
             is_packed_weight = hasattr(
                 self.shared_experts.gate_up_proj.quant_method, "quant_config"
@@ -502,9 +497,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
 
         self.top_k = config.num_experts_per_tok
 
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.n_routed_experts
                 + global_server_args_dict["ep_num_redundant_experts"]
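The recurring pattern in these hunks is replacing the boolean enable_deepep_moe / enable_ep_moe server flags with queries such as global_server_args_dict["moe_a2a_backend"].is_deepep() and get_moe_expert_parallel_world_size() > 1. A minimal sketch of how such an all-to-all backend selector could be modeled is shown below; the enum name, values, and wiring are illustrative assumptions, not the package's actual definition (the new sglang/srt/layers/moe/utils.py in the file list is the likely home of the real one).

# Hypothetical sketch of an A2A-backend selector exposing is_deepep(), mirroring
# the global_server_args_dict["moe_a2a_backend"].is_deepep() calls in the diff.
# The enum name and its values are assumptions for illustration only.
from enum import Enum


class MoeA2ABackend(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    def is_deepep(self) -> bool:
        # True when DeepEP performs the all-to-all token exchange for EP MoE.
        return self is MoeA2ABackend.DEEPEP


# The scattered flag checks in Glm4MoeSparseMoeBlock collapse to one method call.
assert MoeA2ABackend("deepep").is_deepep()
assert not MoeA2ABackend("none").is_deepep()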
@@ -526,12 +521,83 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 num_local_experts=config.n_routed_experts // self.tp_size,
                 hidden_size=config.hidden_size,
                 params_dtype=config.torch_dtype,
-                deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
+                deepep_mode=global_server_args_dict["deepep_mode"],
                 async_finish=True,
                 return_recv_hook=True,
             )
 
-        self._enable_deepep_moe = global_server_args_dict["enable_deepep_moe"]
+        self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
+
+    def forward_normal_dual_stream(
+        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
+            if not _is_cuda:
+                final_hidden_states *= self.routed_scaling_factor
+        current_stream.wait_stream(self.alt_stream)
+
+        if self.ep_size > 1:
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            final_hidden_states += shared_output
+        else:
+            final_hidden_states += shared_output
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
+
+    def forward_normal(
+        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+
+        shared_output = self._forward_shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
+        if not _is_cuda and not _use_aiter:
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if self.ep_size > 1:
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            if shared_output is not None:
+                final_hidden_states += shared_output
+        else:
+            if shared_output is not None:
+                final_hidden_states += shared_output
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
 
 
 class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
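The newly added forward_normal_dual_stream overlaps the shared-expert MLP on the current CUDA stream with the router and routed experts on self.alt_stream, then joins the streams before combining the results. A stripped-down sketch of that overlap pattern, with placeholder shared_mlp / routed_moe callables standing in for the sglang modules:

# Minimal sketch of the dual-stream overlap used by forward_normal_dual_stream:
# shared experts run on the current stream while the routed experts run on an
# alternate stream; the streams are joined before the outputs are combined.
# shared_mlp and routed_moe are placeholder callables, not sglang classes.
import torch


def dual_stream_moe(hidden_states, shared_mlp, routed_moe, alt_stream):
    current_stream = torch.cuda.current_stream()
    # The alternate stream must see all work already queued on the current one.
    alt_stream.wait_stream(current_stream)

    shared_output = shared_mlp(hidden_states)      # runs on the current stream

    with torch.cuda.stream(alt_stream):            # overlapped routed-expert work
        routed_output = routed_moe(hidden_states)

    # Re-join: do not read routed_output before the alternate stream finishes.
    current_stream.wait_stream(alt_stream)
    return routed_output + shared_output


if torch.cuda.is_available():
    x = torch.randn(8, 16, device="cuda")
    mlp = torch.nn.Linear(16, 16, device="cuda")
    out = dual_stream_moe(x, mlp, mlp, torch.cuda.Stream())

In the actual method the combination step additionally applies tensor_model_parallel_all_reduce depending on self.ep_size, self.tp_size, and can_fuse_mlp_allreduce, as the hunk above shows.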
@@ -737,11 +803,8 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif (
-            global_server_args_dict["enable_deepep_moe"]
-            or global_server_args_dict["enable_ep_moe"]
-        ):
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization when in deepep_moe or ep_moe mode."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
 
         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
sglang/srt/models/grok.py CHANGED
@@ -29,6 +29,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
         )
 
         kwargs = {}
-        if global_server_args_dict["enable_ep_moe"]:
+        if get_moe_expert_parallel_world_size() > 1:
             MoEImpl = EPMoE
         else:
             MoEImpl = FusedMoE
@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
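Both grok.py changes come from the same refactor: the expert-parallel implementation is now chosen from the runtime EP world size rather than a server flag, and the checkpoint mapping is always built from FusedMoE. A tiny sketch of the selection idiom, with empty stand-in classes rather than the package's real EPMoE/FusedMoE:

# Sketch of the selection idiom the grok.py hunk switches to; EPMoE and FusedMoE
# here are empty stand-ins, and the EP world size is passed in explicitly.
class FusedMoE: ...


class EPMoE(FusedMoE): ...


def pick_moe_impl(ep_world_size: int) -> type:
    # Expert parallelism is active whenever more than one EP rank exists,
    # which replaces the old enable_ep_moe boolean flag.
    return EPMoE if ep_world_size > 1 else FusedMoE


assert pick_moe_impl(1) is FusedMoE
assert pick_moe_impl(8) is EPMoE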
sglang/srt/models/llama4.py CHANGED
@@ -241,13 +241,22 @@ class Llama4Attention(nn.Module):
             if self.use_qk_norm
             else None
         )
+
+        qkv_quant_config = quant_config
+        o_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("q_proj", prefix) in quant_config.ignore:
+                qkv_quant_config = None
+            if add_prefix("o_proj", prefix) in quant_config.ignore:
+                o_quant_config = None
+
         self.qkv_proj = QKVParallelLinear(
             hidden_size=hidden_size,
             head_size=self.head_dim,
             total_num_heads=self.total_num_heads,
             total_num_kv_heads=self.total_num_kv_heads,
             bias=bias,
-            quant_config=quant_config,
+            quant_config=qkv_quant_config,
             prefix=add_prefix("qkv_proj", prefix),
             tp_rank=attn_tp_rank,
             tp_size=attn_tp_size,
@@ -257,7 +266,7 @@
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
             bias=bias_o_proj,
-            quant_config=quant_config,
+            quant_config=o_quant_config,
             prefix=add_prefix("o_proj", prefix),
             tp_rank=attn_tp_rank,
             tp_size=attn_tp_size,
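These Llama4Attention hunks leave qkv_proj or o_proj unquantized when their prefixed names appear in the quantization config's ignore list. A small sketch of that gating, with simplified stand-ins for the config object and add_prefix (the real ignore entries depend on the loaded checkpoint):

# Sketch of the ignore-list gating added around qkv_proj / o_proj. QuantConfig
# and add_prefix here are simplified stand-ins, not the sglang definitions.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class QuantConfig:
    ignore: List[str] = field(default_factory=list)


def add_prefix(name: str, prefix: str) -> str:
    return f"{prefix}.{name}" if prefix else name


def resolve_quant(quant_config: Optional[QuantConfig], name: str, prefix: str):
    # Return None (i.e. run this projection unquantized) when it is ignored.
    if quant_config and quant_config.ignore:
        if add_prefix(name, prefix) in quant_config.ignore:
            return None
    return quant_config


cfg = QuantConfig(ignore=["model.layers.0.self_attn.q_proj"])
assert resolve_quant(cfg, "q_proj", "model.layers.0.self_attn") is None
assert resolve_quant(cfg, "o_proj", "model.layers.0.self_attn") is cfg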
@@ -406,6 +415,8 @@ class Llama4DecoderLayer(nn.Module):
         )
 
     def _is_moe_layer(self, layer_id: int) -> bool:
+        if self.config.interleave_moe_layer_step == 0:
+            return self.config.num_local_experts > 0
         return (layer_id + 1) % self.config.interleave_moe_layer_step == 0
 
     def forward(
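The _is_moe_layer guard avoids a modulo-by-zero when a config sets interleave_moe_layer_step to 0, treating every layer as MoE whenever the model has local experts at all. An illustration of the resulting behavior (the config values below are made up for the example):

# Illustration of the guarded _is_moe_layer logic from the Llama4DecoderLayer hunk.
def is_moe_layer(layer_id: int, interleave_moe_layer_step: int,
                 num_local_experts: int) -> bool:
    if interleave_moe_layer_step == 0:
        # No interleaving configured: every layer is MoE iff the model has experts.
        return num_local_experts > 0
    return (layer_id + 1) % interleave_moe_layer_step == 0


# step=2: every second layer is MoE; step=0 with experts: all layers are MoE.
assert [is_moe_layer(i, 2, 16) for i in range(4)] == [False, True, False, True]
assert all(is_moe_layer(i, 0, 16) for i in range(4))
assert not any(is_moe_layer(i, 0, 0) for i in range(4))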
sglang/srt/models/mixtral.py CHANGED
@@ -24,6 +24,7 @@ from torch import nn
 from transformers import MixtralConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
             renormalize=True,
         )
 
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
+        MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
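As in grok.py, the Mixtral loader now builds its checkpoint mapping via FusedMoE.make_expert_params_mapping regardless of which MoE implementation is instantiated. Going by the diff's own comment, each entry is a (param_name, weight_name, expert_id, shard_id) tuple; the sketch below shows roughly how such a mapping could be generated and consumed, with the fused w13/w2 naming being an illustrative assumption rather than sglang's exact scheme, and omitting the fp8 scale entries the comment mentions.

# Hypothetical sketch of a (param_name, weight_name, expert_id, shard_id)
# expert mapping, as described by the comment in the hunk. Naming is assumed.
from typing import List, Tuple


def make_expert_params_mapping(
    ckpt_gate_proj_name: str,
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
) -> List[Tuple[str, str, int, str]]:
    mapping = []
    for expert_id in range(num_experts):
        for shard_id, ckpt_name in (
            ("w1", ckpt_gate_proj_name),  # gate projection -> fused w13 shard
            ("w3", ckpt_up_proj_name),    # up projection   -> fused w13 shard
            ("w2", ckpt_down_proj_name),  # down projection -> w2
        ):
            param_name = (
                "experts.w13_weight" if shard_id in ("w1", "w3") else "experts.w2_weight"
            )
            weight_name = f"experts.{expert_id}.{ckpt_name}.weight"
            mapping.append((param_name, weight_name, expert_id, shard_id))
    return mapping


# A loader would scan checkpoint keys for each weight_name and copy the tensor
# into the matching shard of the fused parameter.
mapping = make_expert_params_mapping("w1", "w2", "w3", num_experts=8)
assert ("experts.w13_weight", "experts.0.w1.weight", 0, "w1") in mapping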