sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
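One theme that recurs across this release (server_args.py, the new sglang/srt/layers/moe/utils.py, and the glm4_moe.py diff reproduced below) is the replacement of the boolean enable_deepep_moe / enable_ep_moe flags with a single moe_a2a_backend setting that model code queries through predicates such as is_deepep(). The sketch below only illustrates that pattern, assuming an enum-style setting; the names and the actual definition inside sglang may differ.

from enum import Enum


class MoeA2ABackend(Enum):
    # Illustrative all-to-all backend selector for MoE token dispatch (hypothetical names).
    NONE = "none"
    DEEPEP = "deepep"

    def is_deepep(self) -> bool:
        # Model code in the diff below calls:
        # global_server_args_dict["moe_a2a_backend"].is_deepep()
        return self is MoeA2ABackend.DEEPEP


# The old boolean flag maps onto the new setting roughly like this:
backend = MoeA2ABackend.DEEPEP  # previously: enable_deepep_moe=True
assert backend.is_deepep()

In the hunks that follow, global_server_args_dict["moe_a2a_backend"].is_deepep() replaces checks of the old boolean flags.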
sglang/srt/models/glm4_moe.py +102 -27

@@ -23,6 +23,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     parallel_state,
@@ -49,12 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    DeepEPMoE,
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
@@ -83,7 +81,6 @@ from sglang.srt.two_batch_overlap import (
 )
 from sglang.srt.utils import (
     BumpAllocator,
-    DeepEPMode,
     LazyValue,
     add_prefix,
     bind_or_assign,
@@ -163,7 +160,7 @@ class Glm4MoeMLP(nn.Module):
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x, can_fuse_mlp_allreduce=can_fuse_mlp_allreduce)
+        x, _ = self.down_proj(x, skip_all_reduce=can_fuse_mlp_allreduce)
         return x
 
 
@@ -344,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
         )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
@@ -388,6 +385,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
     ):
         nn.Module.__init__(self)
         self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_size = get_moe_expert_parallel_world_size()
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
         self.num_fused_shared_experts = (
@@ -443,15 +441,14 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             routed_scaling_factor=self.routed_scaling_factor,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -482,11 +479,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 quant_config=quant_config,
                 reduce_results=False,
                 prefix=add_prefix("shared_experts", prefix),
-                **(
-                    dict(tp_rank=0, tp_size=1)
-                    if global_server_args_dict["enable_deepep_moe"]
-                    else {}
-                ),
+                **(dict(tp_rank=0, tp_size=1) if self.ep_size > 1 else {}),
             )
             is_packed_weight = hasattr(
                 self.shared_experts.gate_up_proj.quant_method, "quant_config"
@@ -502,9 +495,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
 
         self.top_k = config.num_experts_per_tok
 
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.n_routed_experts
                 + global_server_args_dict["ep_num_redundant_experts"]
@@ -526,12 +519,97 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 num_local_experts=config.n_routed_experts // self.tp_size,
                 hidden_size=config.hidden_size,
                 params_dtype=config.torch_dtype,
-                deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
+                deepep_mode=global_server_args_dict["deepep_mode"],
                 async_finish=True,
                 return_recv_hook=True,
             )
 
-        self._enable_deepep_moe = global_server_args_dict["enable_deepep_moe"]
+        self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
+            if not _is_cuda:
+                final_hidden_states *= self.routed_scaling_factor
+        current_stream.wait_stream(self.alt_stream)
+
+        if self.ep_size > 1:
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            final_hidden_states += shared_output
+        else:
+            final_hidden_states += shared_output
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+
+        shared_output = self._forward_shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
+        if not _is_cuda and not _use_aiter:
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if self.ep_size > 1:
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            if shared_output is not None:
+                final_hidden_states += shared_output
+        else:
+            if shared_output is not None:
+                final_hidden_states += shared_output
+            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
 
 
 class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
@@ -617,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )
 
     def forward(
@@ -721,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
         )
 
     def determine_num_fused_shared_experts(
-        self, architecture: str = "DeepseekV3ForCausalLM"
+        self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
         if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -733,15 +812,11 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             not _is_cuda
             or torch.cuda.get_device_capability("cuda") < (8, 0)
            or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif (
-            global_server_args_dict["enable_deepep_moe"]
-            or global_server_args_dict["enable_ep_moe"]
-        ):
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization when in deepep_moe or ep_moe mode."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
 
         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
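For readers unfamiliar with the pattern, the largest addition above, forward_normal_dual_stream, overlaps the shared-expert MLP with the routed-expert computation by running the latter on an alternate CUDA stream and synchronizing before the outputs are combined. The following self-contained sketch shows only that stream-overlap idea; DualStreamMoEBlock and its submodules are illustrative stand-ins, not the actual sglang classes.

import torch
import torch.nn as nn


class DualStreamMoEBlock(nn.Module):
    """Toy MoE block illustrating the dual-stream overlap used in glm4_moe.py."""

    def __init__(self, hidden_size: int, ffn_size: int):
        super().__init__()
        # Stand-ins for the shared-expert MLP and the routed experts.
        self.shared_experts = nn.Sequential(
            nn.Linear(hidden_size, ffn_size), nn.SiLU(), nn.Linear(ffn_size, hidden_size)
        )
        self.routed_experts = nn.Sequential(
            nn.Linear(hidden_size, ffn_size), nn.SiLU(), nn.Linear(ffn_size, hidden_size)
        )
        # Alternate stream used to overlap the two branches.
        self.alt_stream = torch.cuda.Stream()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        current_stream = torch.cuda.current_stream()
        # The alternate stream must not read hidden_states before it exists.
        self.alt_stream.wait_stream(current_stream)

        # Branch 1: shared experts run on the current stream.
        shared_output = self.shared_experts(hidden_states)

        # Branch 2: routed experts run concurrently on the alternate stream.
        with torch.cuda.stream(self.alt_stream):
            routed_output = self.routed_experts(hidden_states)

        # Rejoin: the current stream waits until the routed branch is done.
        current_stream.wait_stream(self.alt_stream)
        return shared_output + routed_output


if __name__ == "__main__":
    if torch.cuda.is_available():
        block = DualStreamMoEBlock(64, 256).cuda()
        x = torch.randn(8, 64, device="cuda")
        print(block(x).shape)  # torch.Size([8, 64])

The two wait_stream calls are what make the overlap safe: the alternate stream may not read hidden_states before the current stream has produced it, and the current stream may not consume the routed output before the alternate stream has finished.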