sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/glm4_moe.py +24 -12
@@ -50,11 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
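should_use_flashinfer_trtllm_moe now lives in sglang.srt.layers.moe.utils instead of sglang.srt.layers.moe.ep_moe.layer. For downstream code that still imports it from the old module, a small compatibility shim along these lines (a hypothetical pattern, not part of this diff) works on both 0.4.10.post2 and 0.5.0rc0:

# Hypothetical compatibility shim: try the 0.5.0rc0 location first and fall
# back to the pre-0.5.0 module if the new one is not available.
try:
    from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
except ImportError:
    from sglang.srt.layers.moe.ep_moe.layer import should_use_flashinfer_trtllm_moe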
@@ -162,7 +160,7 @@ class Glm4MoeMLP(nn.Module):
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x, can_fuse_mlp_allreduce=can_fuse_mlp_allreduce)
+        x, _ = self.down_proj(x, skip_all_reduce=can_fuse_mlp_allreduce)
         return x
 
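The down projection call now passes the flag under the keyword skip_all_reduce instead of can_fuse_mlp_allreduce; the same value is forwarded, which suggests a rename on the RowParallelLinear side rather than a behavioral change at this call site. The toy below (standalone, single process, illustrative names only, not the sglang implementation) shows the intent of such a flag: the row-parallel linear skips its own reduction so the caller can fuse it into a later collective.

import torch

def row_parallel_linear(x_shards, w_shards, skip_all_reduce=False):
    # Each "rank" multiplies its slice of the activations by its weight shard.
    partials = [x @ w for x, w in zip(x_shards, w_shards)]
    if skip_all_reduce:
        # Return partial sums; the caller is responsible for reducing them later.
        return partials
    # Emulate tensor_model_parallel_all_reduce by summing the partial outputs.
    return sum(partials)

torch.manual_seed(0)
x, w = torch.randn(4, 8), torch.randn(8, 6)
x_shards = torch.chunk(x, 2, dim=1)   # split the contraction dimension
w_shards = torch.chunk(w, 2, dim=0)

reduced = row_parallel_linear(x_shards, w_shards)
deferred = sum(row_parallel_linear(x_shards, w_shards, skip_all_reduce=True))
assert torch.allclose(reduced, x @ w, atol=1e-5)
assert torch.allclose(deferred, reduced)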
 
@@ -343,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
        )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
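e_score_correction_bias is now allocated with an explicit dtype=torch.float32 rather than inheriting the ambient default dtype, presumably so the routing correction bias stays in full precision even when the model is instantiated under a half-precision default. A quick standalone check of the difference (plain PyTorch, not sglang code):

import torch
import torch.nn as nn

# Model loaders commonly set a half-precision default dtype; without an explicit
# dtype the bias follows it, while dtype=torch.float32 pins it to full precision.
torch.set_default_dtype(torch.bfloat16)
implicit = nn.Parameter(torch.empty(8))
explicit = nn.Parameter(torch.empty(8, dtype=torch.float32))
print(implicit.dtype)  # torch.bfloat16
print(explicit.dtype)  # torch.float32
torch.set_default_dtype(torch.float32)  # restore the usual default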
@@ -529,7 +527,10 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
 
     def forward_normal_dual_stream(
-        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
 
         current_stream = torch.cuda.current_stream()
@@ -550,21 +551,32 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         current_stream.wait_stream(self.alt_stream)
 
         if self.ep_size > 1:
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
             final_hidden_states += shared_output
         else:
             final_hidden_states += shared_output
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if (
+                self.tp_size > 1
+                and not can_fuse_mlp_allreduce
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
         return final_hidden_states
 
     def forward_normal(
-        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+        self,
+        hidden_states: torch.Tensor,
+        can_fuse_mlp_allreduce: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
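Both the dual-stream and normal paths gain a use_reduce_scatter flag, and the tensor-parallel all-reduce is now skipped when either that flag or can_fuse_mlp_allreduce is set. A condensed restatement of the guard as a pure function (illustrative only; the predicate mirrors the condition shown in the diff):

def needs_moe_all_reduce(tp_size: int,
                         can_fuse_mlp_allreduce: bool,
                         use_reduce_scatter: bool) -> bool:
    # The MoE block only issues its own all-reduce when tensor parallelism is
    # active and no other mechanism will perform the reduction.
    return (
        tp_size > 1
        and not can_fuse_mlp_allreduce
        and not use_reduce_scatter
    )

assert needs_moe_all_reduce(8, False, False) is True
# A downstream reduce-scatter (or a fused MLP all-reduce) makes it redundant.
assert needs_moe_all_reduce(8, False, True) is False
assert needs_moe_all_reduce(8, True, False) is False
assert needs_moe_all_reduce(1, False, False) is False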
@@ -683,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )
 
     def forward(
@@ -787,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
         )
 
     def determine_num_fused_shared_experts(
-        self, architecture: str = "DeepseekV3ForCausalLM"
+        self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
         if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -799,7 +812,6 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             not _is_cuda
             or torch.cuda.get_device_capability("cuda") < (8, 0)
             or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
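Taken together, the last two hunks change the default architecture argument to "Glm4MoeForCausalLM" and drop the requirement of exactly 128 routed experts, so fusion eligibility now rests on CUDA with compute capability >= 8.0, a matching architecture string, and a single shared expert. A simplified restatement of the post-change check (illustrative function, not the sglang method):

from typing import Tuple

def shared_experts_fusion_supported(
    is_cuda: bool,
    device_capability: Tuple[int, int],
    architecture: str,
    n_shared_experts: int,
    expected_architecture: str = "Glm4MoeForCausalLM",
) -> bool:
    # Mirrors the remaining disable conditions after this diff; note that the
    # number of routed experts is no longer part of the check.
    return not (
        not is_cuda
        or device_capability < (8, 0)
        or architecture != expected_architecture
        or n_shared_experts != 1
    )

assert shared_experts_fusion_supported(True, (9, 0), "Glm4MoeForCausalLM", 1)
assert not shared_experts_fusion_supported(True, (7, 5), "Glm4MoeForCausalLM", 1)
assert not shared_experts_fusion_supported(True, (9, 0), "DeepseekV3ForCausalLM", 1)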