sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py CHANGED
@@ -51,6 +51,7 @@ from sglang.srt.layers.dp_attention import (
  get_attention_tp_rank,
  get_attention_tp_size,
  get_local_attention_dp_size,
+ is_dp_attention_enabled,
  )
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.linear import (
@@ -212,7 +213,7 @@ class DeepseekV2MLP(nn.Module):
  self,
  x,
  forward_batch=None,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ):
  if (self.tp_size == 1) and x.shape[0] == 0:
@@ -221,7 +222,7 @@ class DeepseekV2MLP(nn.Module):
  gate_up, _ = self.gate_up_proj(x)
  x = self.act_fn(gate_up)
  x, _ = self.down_proj(
- x, skip_all_reduce=can_fuse_mlp_allreduce or use_reduce_scatter
+ x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
  )
  return x

@@ -448,7 +449,7 @@ class DeepseekV2MoE(nn.Module):
  self,
  hidden_states: torch.Tensor,
  forward_batch: Optional[ForwardBatch] = None,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ) -> torch.Tensor:
  if not self._enable_deepep_moe:
@@ -459,11 +460,11 @@ class DeepseekV2MoE(nn.Module):
  and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
  ):
  return self.forward_normal_dual_stream(
- hidden_states, can_fuse_mlp_allreduce, use_reduce_scatter
+ hidden_states, should_allreduce_fusion, use_reduce_scatter
  )
  else:
  return self.forward_normal(
- hidden_states, can_fuse_mlp_allreduce, use_reduce_scatter
+ hidden_states, should_allreduce_fusion, use_reduce_scatter
  )
  else:
  return self.forward_deepep(hidden_states, forward_batch)
@@ -471,7 +472,7 @@ class DeepseekV2MoE(nn.Module):
  def forward_normal_dual_stream(
  self,
  hidden_states: torch.Tensor,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ) -> torch.Tensor:

@@ -500,20 +501,20 @@ class DeepseekV2MoE(nn.Module):
  torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
  final_hidden_states = final_hidden_states_out
  sm.tag(final_hidden_states)
- if self.tp_size > 1 and not can_fuse_mlp_allreduce and not use_reduce_scatter:
+ if self.tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter:
  final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
  return final_hidden_states

  def forward_normal(
  self,
  hidden_states: torch.Tensor,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ) -> torch.Tensor:
  if hasattr(self, "shared_experts") and use_intel_amx_backend(
  self.shared_experts.gate_up_proj
  ):
- return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+ return self.forward_cpu(hidden_states, should_allreduce_fusion)

  shared_output = self._forward_shared_experts(hidden_states)
  # router_logits: (num_tokens, n_experts)
@@ -537,12 +538,14 @@ class DeepseekV2MoE(nn.Module):
  torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
  final_hidden_states = final_hidden_states_out
  sm.tag(final_hidden_states)
- if self.tp_size > 1 and not can_fuse_mlp_allreduce and not use_reduce_scatter:
+ if self.tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter:
  final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
  return final_hidden_states

  def forward_cpu(
- self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+ self,
+ hidden_states: torch.Tensor,
+ should_allreduce_fusion: bool = False,
  ) -> torch.Tensor:
  # router_logits: (num_tokens, n_experts)
  router_logits = self.gate(hidden_states)
@@ -593,7 +596,7 @@ class DeepseekV2MoE(nn.Module):
  None, # a2_scale
  True, # is_vnni
  )
- if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+ if self.tp_size > 1 and not should_allreduce_fusion:
  final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
  return final_hidden_states

@@ -1194,6 +1197,16 @@ class DeepseekV2AttentionMLA(nn.Module):
  output, _ = self.o_proj(attn_output)
  return output

+ def _fuse_rope_for_trtllm_mla(self, forward_batch: ForwardBatch) -> bool:
+ """
+ Check if we should skip rope and do fused rope+quantize for TRTLLM MLA decode in fp8_e4m3 path.
+ """
+ return (
+ self.current_attention_backend == "trtllm_mla"
+ and forward_batch.forward_mode.is_decode_or_idle()
+ and forward_batch.attn_backend.data_type == torch.float8_e4m3fn
+ )
+
  def forward_absorb_prepare(
  self,
  positions: torch.Tensor,
@@ -1273,7 +1286,9 @@ class DeepseekV2AttentionMLA(nn.Module):
  q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)

  q_nope_out = q_nope_out.transpose(0, 1)
- q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+ if not self._fuse_rope_for_trtllm_mla(forward_batch):
+ q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)

  return q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator

@@ -1286,8 +1301,20 @@ class DeepseekV2AttentionMLA(nn.Module):
  or self.current_attention_backend == "cutlass_mla"
  or self.current_attention_backend == "trtllm_mla"
  ):
+ extra_args = {}
+ if self._fuse_rope_for_trtllm_mla(forward_batch):
+ extra_args = {
+ "cos_sin_cache": self.rotary_emb.cos_sin_cache,
+ "is_neox": self.rotary_emb.is_neox_style,
+ }
  attn_output = self.attn_mqa(
- q_nope_out, k_nope, k_nope, forward_batch, q_rope=q_pe, k_rope=k_pe
+ q_nope_out,
+ k_nope,
+ k_nope,
+ forward_batch,
+ q_rope=q_pe,
+ k_rope=k_pe,
+ **extra_args,
  )
  else:
  q = torch.cat([q_nope_out, q_pe], dim=-1)
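Note on the two hunks above: when the TRTLLM MLA backend decodes with an fp8_e4m3 KV cache, RoPE is no longer applied in forward_absorb_prepare; instead cos_sin_cache and is_neox are forwarded so the backend can fuse rotary embedding with quantization. A minimal sketch of that conditional-kwargs dispatch, where attn, rotary_emb, and the surrounding function are illustrative placeholders rather than the real sglang APIs:

# Illustrative only: mirrors the dispatch pattern in the diff above.
def run_mla_attention(attn, rotary_emb, q_nope, k_nope, q_pe, k_pe,
                      positions, forward_batch, fuse_rope: bool):
    extra_args = {}
    if fuse_rope:
        # Let the backend apply RoPE (and fp8 quantization) inside its fused kernel.
        extra_args = {
            "cos_sin_cache": rotary_emb.cos_sin_cache,
            "is_neox": rotary_emb.is_neox_style,
        }
    else:
        # Otherwise apply RoPE eagerly before calling the backend.
        q_pe, k_pe = rotary_emb(positions, q_pe, k_pe)
    return attn(q_nope, k_nope, k_nope, forward_batch,
                q_rope=q_pe, k_rope=k_pe, **extra_args)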
@@ -1771,7 +1798,6 @@ class DeepseekV2DecoderLayer(nn.Module):
  rope_theta = getattr(config, "rope_theta", 10000)
  rope_scaling = getattr(config, "rope_scaling", None)
  max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
- self.enable_dp_attention = global_server_args_dict["enable_dp_attention"]
  self.speculative_algorithm = global_server_args_dict["speculative_algorithm"]
  self.layer_id = layer_id
  self.is_nextn = is_nextn
@@ -1842,6 +1868,8 @@ class DeepseekV2DecoderLayer(nn.Module):
  allow_reduce_scatter=True,
  )

+ self._fuse_allreduce_lookup_table = self._build_fuse_allreduce_lookup_table()
+
  def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
  return is_nextn or (
  self.config.n_routed_experts is not None
@@ -1850,27 +1878,18 @@ class DeepseekV2DecoderLayer(nn.Module):
  )

  def _should_fuse_mlp_allreduce_with_next_layer(self, forward_batch) -> bool:
- """Check if MLP allreduce can be fused with next layer's add_rmsnorm"""
+ """Check if MLP allreduce can be fused with next layer's residual_rmsnorm"""

- if (
- self.layer_id == self.config.num_hidden_layers - 1
- or get_tensor_model_parallel_world_size() <= 1
- ):
- return False
-
- if not global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False):
- return False
-
- if not _is_sm100_supported or not _is_flashinfer_available:
- return False
+ batch_size = (
+ forward_batch.input_ids.shape[0]
+ if hasattr(forward_batch, "input_ids")
+ else 0
+ )

- if hasattr(forward_batch, "input_ids") and (
- forward_batch.input_ids.shape[0] == 0
- or forward_batch.input_ids.shape[0] > 128
- ):
+ if batch_size > 128:
  return False

- return True
+ return self._fuse_allreduce_lookup_table.get(batch_size, False)

  def forward(
  self,
@@ -1896,9 +1915,11 @@ class DeepseekV2DecoderLayer(nn.Module):
  hidden_states, residual, forward_batch
  )

- can_fuse_mlp_allreduce = (
+ should_allreduce_fusion = (
  self._should_fuse_mlp_allreduce_with_next_layer(forward_batch)
- and not (self.enable_dp_attention and self.speculative_algorithm.is_eagle())
+ and not (
+ is_dp_attention_enabled() and self.speculative_algorithm.is_eagle()
+ )
  and not self.is_nextn
  )

@@ -1907,13 +1928,13 @@ class DeepseekV2DecoderLayer(nn.Module):
  forward_batch
  )
  hidden_states = self.mlp(
- hidden_states, forward_batch, can_fuse_mlp_allreduce, use_reduce_scatter
+ hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
  )

- if can_fuse_mlp_allreduce:
+ if should_allreduce_fusion:
  hidden_states._sglang_needs_allreduce_fusion = True

- if not can_fuse_mlp_allreduce:
+ if not should_allreduce_fusion:
  hidden_states, residual = self.layer_communicator.postprocess_layer(
  hidden_states, residual, forward_batch
  )
@@ -1990,6 +2011,26 @@ class DeepseekV2DecoderLayer(nn.Module):
  )
  return output

+ def _build_fuse_allreduce_lookup_table(self):
+ static_conditions_met = (
+ self.layer_id != self.config.num_hidden_layers - 1
+ and get_tensor_model_parallel_world_size() > 1
+ and global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False)
+ and _is_sm100_supported
+ and _is_flashinfer_available
+ )
+
+ if not static_conditions_met:
+ return {}
+
+ lookup_table = {}
+ for batch_size in range(129): # 0 to 128
+ is_last_layer = self.layer_id == self.config.num_hidden_layers - 1
+ should_fuse = batch_size > 0 and batch_size <= 128 and not is_last_layer
+ lookup_table[batch_size] = should_fuse
+
+ return lookup_table
+

  class DeepseekV2Model(nn.Module):
  fall_back_to_pt_during_load = False
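Note on _build_fuse_allreduce_lookup_table above: the static gating conditions (last layer, TP world size, the enable_flashinfer_allreduce_fusion flag, SM100 and FlashInfer availability) are now evaluated once at construction time, so the per-forward check reduces to a dictionary lookup keyed by batch size. A self-contained sketch of the same pattern, with made-up condition names rather than the actual sglang globals:

# Sketch of the precomputed-lookup pattern; names are stand-ins, not sglang APIs.
class FusionGate:
    MAX_FUSED_BATCH = 128

    def __init__(self, layer_id: int, num_layers: int, tp_size: int, fusion_flag: bool):
        static_ok = (
            layer_id != num_layers - 1  # the last layer has no "next layer" to fuse into
            and tp_size > 1             # fusion only matters when an allreduce happens
            and fusion_flag             # hardware / server-arg checks would go here too
        )
        # Evaluate static conditions once; at runtime only the batch size varies.
        self._table = (
            {bs: True for bs in range(1, self.MAX_FUSED_BATCH + 1)} if static_ok else {}
        )

    def should_fuse(self, batch_size: int) -> bool:
        # Batch size 0, or anything above 128, falls through to False.
        return self._table.get(batch_size, False)


gate = FusionGate(layer_id=0, num_layers=61, tp_size=8, fusion_flag=True)
assert gate.should_fuse(32) and not gate.should_fuse(512)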
@@ -2008,7 +2049,7 @@ class DeepseekV2Model(nn.Module):
  self.embed_tokens = VocabParallelEmbedding(
  config.vocab_size,
  config.hidden_size,
- enable_tp=not global_server_args_dict["enable_dp_attention"],
+ enable_tp=not is_dp_attention_enabled(),
  )
  self.alt_stream = torch.cuda.Stream() if _is_cuda else None
  self.layers = nn.ModuleList(
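Several hunks in this diff (here and in the GLM-4 MoE files below) replace direct reads of global_server_args_dict["enable_dp_attention"] with the is_dp_attention_enabled() helper imported from sglang.srt.layers.dp_attention. A minimal sketch of that accessor pattern; the body shown here is an assumption, not the actual sglang implementation:

# Hypothetical sketch: a module-level flag set once at startup and read through
# a helper, instead of every call site indexing a global server-args dict.
_ENABLE_DP_ATTENTION: bool = False

def initialize_dp_attention_flag(enable_dp_attention: bool) -> None:
    global _ENABLE_DP_ATTENTION
    _ENABLE_DP_ATTENTION = enable_dp_attention

def is_dp_attention_enabled() -> bool:
    return _ENABLE_DP_ATTENTION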
sglang/srt/models/gemma2.py CHANGED
@@ -432,40 +432,6 @@ class Gemma2ForCausalLM(nn.Module):

  return result

- def get_hidden_dim(self, module_name):
- # return input_dim, output_dim
- if module_name in ["q_proj", "qkv_proj"]:
- return (
- self.config.hidden_size,
- self.config.head_dim * self.config.num_attention_heads,
- )
- elif module_name in ["o_proj"]:
- return (
- self.config.head_dim * self.config.num_attention_heads,
- self.config.hidden_size,
- )
- elif module_name in ["kv_proj"]:
- return (
- self.config.hidden_size,
- self.config.head_dim * self.config.num_key_value_heads,
- )
- elif module_name == "gate_up_proj":
- return self.config.hidden_size, self.config.intermediate_size
- elif module_name == "down_proj":
- return self.config.intermediate_size, self.config.hidden_size
- else:
- raise NotImplementedError()
-
- def get_module_name(self, name):
- params_mapping = {
- "q_proj": "qkv_proj",
- "k_proj": "qkv_proj",
- "v_proj": "qkv_proj",
- "gate_proj": "gate_up_proj",
- "up_proj": "gate_up_proj",
- }
- return params_mapping.get(name, name)
-
  def get_attention_sliding_window_size(self):
  return get_attention_sliding_window_size(self.config)

sglang/srt/models/gemma3n_mm.py CHANGED
@@ -501,27 +501,26 @@ class Gemma3nForConditionalGeneration(PreTrainedModel):

  def get_hidden_dim(self, module_name):
  # return input_dim, output_dim
- if module_name in ["q_proj", "qkv_proj"]:
+ if module_name == "qkv_proj":
  return (
  self.config.hidden_size,
- self.config.head_dim * self.config.num_attention_heads,
+ self.config.head_dim
+ * (
+ self.config.num_attention_heads
+ + self.config.num_key_value_heads * 2
+ ),
  )
- elif module_name in ["o_proj"]:
+ elif module_name == "o_proj":
  return (
  self.config.head_dim * self.config.num_attention_heads,
  self.config.hidden_size,
  )
- elif module_name in ["kv_proj"]:
- return (
- self.config.hidden_size,
- self.config.head_dim * self.config.num_key_value_heads,
- )
  elif module_name == "gate_up_proj":
  assert len(set(self.config.intermediate_size)) == 1, (
  "Currently SGLang requires uniform intermediate size for all layers. "
  "Please file an issue if you need support for non-uniform intermediate sizes."
  )
- return self.config.hidden_size, self.config.intermediate_size[0]
+ return self.config.hidden_size, self.config.intermediate_size[0] * 2
  elif module_name == "down_proj":
  assert len(set(self.config.intermediate_size)) == 1, (
  "Currently SGLang requires uniform intermediate size for all layers. "
sglang/srt/models/glm4.py CHANGED
@@ -218,6 +218,12 @@ class Glm4Model(nn.Module):

  self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.embed_tokens
+
+ def dtype(self) -> torch.dtype:
+ return next(self.parameters()).dtype
+
  @torch.no_grad()
  def forward(
  self,
sglang/srt/models/glm4_moe.py CHANGED
@@ -40,6 +40,7 @@ from sglang.srt.layers.dp_attention import (
  get_attention_tp_rank,
  get_attention_tp_size,
  get_local_attention_dp_size,
+ is_dp_attention_enabled,
  )
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.linear import (
@@ -154,13 +155,13 @@ class Glm4MoeMLP(nn.Module):
  )
  self.act_fn = SiluAndMul()

- def forward(self, x, forward_batch=None, can_fuse_mlp_allreduce=False):
+ def forward(self, x, forward_batch=None, should_allreduce_fusion=False):
  if (self.tp_size == 1) and x.shape[0] == 0:
  return x

  gate_up, _ = self.gate_up_proj(x)
  x = self.act_fn(gate_up)
- x, _ = self.down_proj(x, skip_all_reduce=can_fuse_mlp_allreduce)
+ x, _ = self.down_proj(x, skip_all_reduce=should_allreduce_fusion)
  return x


@@ -529,7 +530,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  def forward_normal_dual_stream(
  self,
  hidden_states: torch.Tensor,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ) -> torch.Tensor:

@@ -553,7 +554,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  if self.ep_size > 1:
  if (
  self.tp_size > 1
- and not can_fuse_mlp_allreduce
+ and not should_allreduce_fusion
  and not use_reduce_scatter
  ):
  final_hidden_states = tensor_model_parallel_all_reduce(
@@ -564,7 +565,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  final_hidden_states += shared_output
  if (
  self.tp_size > 1
- and not can_fuse_mlp_allreduce
+ and not should_allreduce_fusion
  and not use_reduce_scatter
  ):
  final_hidden_states = tensor_model_parallel_all_reduce(
@@ -575,13 +576,13 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  def forward_normal(
  self,
  hidden_states: torch.Tensor,
- can_fuse_mlp_allreduce: bool = False,
+ should_allreduce_fusion: bool = False,
  use_reduce_scatter: bool = False,
  ) -> torch.Tensor:
  if hasattr(self, "shared_experts") and use_intel_amx_backend(
  self.shared_experts.gate_up_proj
  ):
- return self.forward_cpu(hidden_states, can_fuse_mlp_allreduce)
+ return self.forward_cpu(hidden_states, should_allreduce_fusion)

  shared_output = self._forward_shared_experts(hidden_states)
  # router_logits: (num_tokens, n_experts)
@@ -596,7 +597,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  # fused in biased_grouped_topk so we can skip here
  final_hidden_states *= self.routed_scaling_factor
  if self.ep_size > 1:
- if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+ if self.tp_size > 1 and not should_allreduce_fusion:
  final_hidden_states = tensor_model_parallel_all_reduce(
  final_hidden_states
  )
@@ -605,7 +606,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
  else:
  if shared_output is not None:
  final_hidden_states += shared_output
- if self.tp_size > 1 and not can_fuse_mlp_allreduce:
+ if self.tp_size > 1 and not should_allreduce_fusion:
  final_hidden_states = tensor_model_parallel_all_reduce(
  final_hidden_states
  )
@@ -634,7 +635,6 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
  )
  rms_norm_eps = config.rms_norm_eps
  attention_bias = config.attention_bias
- self.enable_dp_attention = global_server_args_dict["enable_dp_attention"]
  self.layer_id = layer_id
  self.self_attn = Glm4MoeAttention(
  hidden_size=self.hidden_size,
@@ -744,7 +744,7 @@ class Glm4MoeModel(DeepseekV2Model):
  self.embed_tokens = VocabParallelEmbedding(
  config.vocab_size,
  config.hidden_size,
- enable_tp=not global_server_args_dict["enable_dp_attention"],
+ enable_tp=not is_dp_attention_enabled(),
  )
  self.alt_stream = torch.cuda.Stream() if _is_cuda else None
  self.layers = nn.ModuleList(
sglang/srt/models/glm4_moe_nextn.py CHANGED
@@ -22,6 +22,7 @@ from transformers import PretrainedConfig

  from sglang.srt.distributed import get_tensor_model_parallel_world_size
  from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+ from sglang.srt.layers.dp_attention import is_dp_attention_enabled
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -56,7 +57,7 @@ class Glm4MoeModelNextN(nn.Module):
  self.embed_tokens = VocabParallelEmbedding(
  config.vocab_size,
  config.hidden_size,
- enable_tp=not global_server_args_dict["enable_dp_attention"],
+ enable_tp=not is_dp_attention_enabled(),
  prefix=add_prefix("embed_tokens", prefix),
  )