sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -492,5 +492,43 @@ class Gemma3nForConditionalGeneration(PreTrainedModel):
                 loaded_params.add(name)
         return loaded_params
 
+    lora_pattern = re.compile(
+        r"^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self.lora_pattern.match(module_name))
+
+    def get_hidden_dim(self, module_name):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.hidden_size, self.config.intermediate_size[0] * 2
+        elif module_name == "down_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.intermediate_size[0], self.config.hidden_size
+        else:
+            raise NotImplementedError()
+
 
 EntryClass = Gemma3nForConditionalGeneration
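The hooks added above give the LoRA machinery two pieces of information: which submodules may receive adapters (`should_apply_lora`) and the (input_dim, output_dim) shape each adapter must have (`get_hidden_dim`). The sketch below only illustrates that contract; the config values are made up for the example, not Gemma3n's real hyperparameters.

```python
# Hypothetical usage sketch (not SGLang's LoRA manager): mirror the qkv_proj branch
# of get_hidden_dim() to see how the adapter output width is derived.
from types import SimpleNamespace

# Toy config with the fields get_hidden_dim() reads; the numbers are illustrative.
config = SimpleNamespace(
    hidden_size=2048,
    head_dim=256,
    num_attention_heads=8,
    num_key_value_heads=2,
    intermediate_size=[16384] * 30,  # stored as a per-layer list, hence the uniformity assert
)

def qkv_out_dim(cfg):
    # Same arithmetic as the qkv_proj branch above: Q heads plus K and V heads.
    return cfg.head_dim * (cfg.num_attention_heads + cfg.num_key_value_heads * 2)

print(qkv_out_dim(config))  # 256 * (8 + 4) = 3072
```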
sglang/srt/models/glm4.py CHANGED
@@ -218,6 +218,12 @@ class Glm4Model(nn.Module):
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
     @torch.no_grad()
     def forward(
         self,
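The two accessors added to `Glm4Model` are plain delegation helpers: callers can obtain the embedding table and parameter dtype without touching private attributes. A toy stand-in (not the real `Glm4Model`) showing the same pattern:

```python
# Minimal sketch of the accessor pattern; TinyModel is a hypothetical stand-in.
import torch
import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self, vocab_size=16, hidden_size=8):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embed_tokens

    def dtype(self) -> torch.dtype:
        return next(self.parameters()).dtype

m = TinyModel()
print(m.get_input_embeddings(), m.dtype())  # Embedding(16, 8) torch.float32
```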
sglang/srt/models/glm4_moe.py CHANGED
@@ -50,11 +50,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
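This hunk only relocates an import: `should_use_flashinfer_trtllm_moe` now lives in `sglang.srt.layers.moe.utils` instead of `sglang.srt.layers.moe.ep_moe.layer`. If external code imported it from the old location, a fallback import keeps both versions working; both paths are taken from the lines above, nothing else is assumed.

```python
# Compatibility shim sketch for code that must run on 0.4.x and 0.5.x alike.
try:
    from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
except ImportError:  # older sglang releases kept it next to the EP-MoE layer
    from sglang.srt.layers.moe.ep_moe.layer import should_use_flashinfer_trtllm_moe
```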
@@ -156,13 +154,13 @@ class Glm4MoeMLP(nn.Module):
         )
         self.act_fn = SiluAndMul()
 
-    def forward(self, x, forward_batch=None, can_fuse_mlp_allreduce=False):
+    def forward(self, x, forward_batch=None, should_allreduce_fusion=False):
         if (self.tp_size == 1) and x.shape[0] == 0:
             return x
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x, can_fuse_mlp_allreduce=can_fuse_mlp_allreduce)
+        x, _ = self.down_proj(x, skip_all_reduce=should_allreduce_fusion)
         return x
 
 
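The rename clarifies intent: when a later fused kernel is expected to perform the all-reduce, the MLP's down projection is asked to skip its own via `skip_all_reduce=True`. A toy illustration of that control flow (the function below is a hypothetical stand-in, not `RowParallelLinear`):

```python
# Sketch of the skip-all-reduce flag's semantics under tensor parallelism.
import torch

def toy_down_proj(x: torch.Tensor, skip_all_reduce: bool = False):
    y = x * 2.0  # stand-in for the actual GEMM
    if not skip_all_reduce:
        # In a real TP setup, torch.distributed.all_reduce(y) would run here;
        # with fusion enabled the combine happens later in a fused kernel.
        pass
    return y, None

out, _ = toy_down_proj(torch.ones(2, 4), skip_all_reduce=True)
```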
@@ -343,7 +341,7 @@ class Glm4MoeGate(nn.Module):
             torch.empty((config.n_routed_experts, config.hidden_size))
         )
         self.e_score_correction_bias = nn.Parameter(
-            torch.empty((config.n_routed_experts))
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
         )
         if _is_cpu and _is_cpu_amx_available:
             self.quant_method = PackWeightMethod(weight_names=["weight"])
@@ -529,7 +527,10 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()
 
     def forward_normal_dual_stream(
-        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
 
         current_stream = torch.cuda.current_stream()
@@ -550,26 +551,37 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             current_stream.wait_stream(self.alt_stream)
 
         if self.ep_size > 1:
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
             final_hidden_states += shared_output
         else:
             final_hidden_states += shared_output
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
         return final_hidden_states
 
     def forward_normal(
-        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
         ):
-            return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)
 
         shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
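Both branches above gate the local tensor-parallel all-reduce on the same condition: it only runs when TP is active and neither allreduce fusion nor reduce-scatter will handle the combine later. A minimal sketch of that predicate, extracted only to make the boolean logic explicit (the helper name is hypothetical):

```python
# Pure-Python restatement of the gating condition used in the hunk above.
def needs_local_all_reduce(
    tp_size: int,
    should_allreduce_fusion: bool,
    use_reduce_scatter: bool,
) -> bool:
    return tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter

assert needs_local_all_reduce(4, False, False)        # plain TP: reduce here
assert not needs_local_all_reduce(4, False, True)     # reduce-scatter path takes over
assert not needs_local_all_reduce(1, False, False)    # no TP: nothing to reduce
```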
@@ -584,7 +596,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         # fused in biased_grouped_topk so we can skip here
         final_hidden_states *= self.routed_scaling_factor
         if self.ep_size > 1:
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if self.tp_size > 1 and not should_allreduce_fusion:
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
@@ -593,7 +605,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         else:
             if shared_output is not None:
                 final_hidden_states += shared_output
-            if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+            if self.tp_size > 1 and not should_allreduce_fusion:
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states
                 )
@@ -683,6 +695,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )
 
     def forward(
@@ -787,7 +800,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
         )
 
     def determine_num_fused_shared_experts(
-        self, architecture: str = "DeepseekV3ForCausalLM"
+        self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
         if global_server_args_dict["disable_shared_experts_fusion"]:
@@ -799,7 +812,6 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             not _is_cuda
            or torch.cuda.get_device_capability("cuda") < (8, 0)
            or self.config.architectures[0] != architecture
-            or self.config.n_routed_experts != 128
            or self.config.n_shared_experts != 1
        ):
            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
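The two changes above correct the default architecture string and drop the `n_routed_experts != 128` test, so the shared-experts fusion check no longer depends on a specific routed-expert count. A hedged restatement of the resulting eligibility condition (the helper and its flattened arguments are hypothetical; the real check reads these values from the model config and server args):

```python
# Sketch of the post-diff gating: GLM-4.5-family checkpoints with one shared expert
# qualify on CUDA devices with compute capability >= 8.0, regardless of routed-expert count.
def shared_experts_fusion_supported(is_cuda, capability, arch, n_shared_experts):
    return (
        is_cuda
        and capability >= (8, 0)
        and arch == "Glm4MoeForCausalLM"
        and n_shared_experts == 1
    )

assert shared_experts_fusion_supported(True, (9, 0), "Glm4MoeForCausalLM", 1)
assert not shared_experts_fusion_supported(True, (7, 5), "Glm4MoeForCausalLM", 1)
```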