sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -60,12 +60,9 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import (
-    DeepEPMoE,
-    get_moe_impl_class,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
 from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import (
@@ -211,13 +208,21 @@ class DeepseekV2MLP(nn.Module):
         )
         self.act_fn = SiluAndMul()
 
-    def forward(
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
         if (self.tp_size == 1) and x.shape[0] == 0:
             return x
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
         return x
 
 
@@ -307,19 +312,15 @@ class DeepseekV2MoE(nn.Module):
             config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
         )
 
-        self.topk = (
-
-
-
-
-
-
-
-
-            routed_scaling_factor=self.routed_scaling_factor,
-        )
-        if not should_use_flashinfer_trtllm_moe()
-        else None
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
         )
 
         self.experts = get_moe_impl_class()(
@@ -447,7 +448,8 @@ class DeepseekV2MoE(nn.Module):
         self,
         hidden_states: torch.Tensor,
         forward_batch: Optional[ForwardBatch] = None,
-
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
         if not self._enable_deepep_moe:
             DUAL_STREAM_TOKEN_THRESHOLD = 1024
@@ -457,15 +459,20 @@
                 and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
             ):
                 return self.forward_normal_dual_stream(
-                    hidden_states,
+                    hidden_states, should_allreduce_fusion, use_reduce_scatter
                 )
             else:
-                return self.forward_normal(
+                return self.forward_normal(
+                    hidden_states, should_allreduce_fusion, use_reduce_scatter
+                )
         else:
             return self.forward_deepep(hidden_states, forward_batch)
 
     def forward_normal_dual_stream(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
     ) -> torch.Tensor:
 
         current_stream = torch.cuda.current_stream()
@@ -476,10 +483,14 @@ class DeepseekV2MoE(nn.Module):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
             kwargs = {"hidden_states": hidden_states}
-
-
+
+            # FlashInferFP4MoE (TRTLLM path) expects (TopK, router_logits) tuple
+            # Regular FusedMoE (CUTLASS path) expects StandardTopKOutput
+            if should_use_flashinfer_trtllm_moe():
+                kwargs["topk_output"] = (self.topk, router_logits)
             else:
-                kwargs["
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+
             final_hidden_states = self.experts(**kwargs)
             if not _is_cuda:
                 final_hidden_states *= self.routed_scaling_factor
@@ -489,26 +500,33 @@ class DeepseekV2MoE(nn.Module):
             torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
             final_hidden_states = final_hidden_states_out
             sm.tag(final_hidden_states)
-        if self.tp_size > 1 and not
+        if self.tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
     def forward_normal(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
    ) -> torch.Tensor:
        if hasattr(self, "shared_experts") and use_intel_amx_backend(
            self.shared_experts.gate_up_proj
        ):
-            return self.forward_cpu(hidden_states,
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)
 
        shared_output = self._forward_shared_experts(hidden_states)
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(hidden_states)
        kwargs = {"hidden_states": hidden_states}
-
-
+
+        # FlashInferFP4MoE (TRTLLM path) expects (TopK, router_logits) tuple
+        # Regular FusedMoE (CUTLASS path) expects StandardTopKOutput
+        if should_use_flashinfer_trtllm_moe():
+            kwargs["topk_output"] = (self.topk, router_logits)
        else:
-            kwargs["
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+
        final_hidden_states = self.experts(**kwargs)
        if not _is_cuda and not _use_aiter:
            # fused in biased_grouped_topk so we can skip here
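Note: the routing change above (applied identically in forward_normal_dual_stream and forward_normal) relies on the two topk_output shapes named in the new comments. The following minimal sketch only illustrates that dispatch pattern; run_moe and its parameters are stand-ins, not sglang's actual modules or call signatures.

# Illustrative sketch only (not sglang code): dispatching the two
# topk_output shapes described in the comments above.
def run_moe(hidden_states, gate, topk, experts, use_trtllm_moe: bool):
    router_logits = gate(hidden_states)  # (num_tokens, n_experts)
    kwargs = {"hidden_states": hidden_states}
    if use_trtllm_moe:
        # TRT-LLM path: pass the un-evaluated TopK module together with the
        # raw router logits, and let the expert layer consume them itself.
        kwargs["topk_output"] = (topk, router_logits)
    else:
        # CUTLASS/Triton path: compute the top-k routing result up front.
        kwargs["topk_output"] = topk(hidden_states, router_logits)
    return experts(**kwargs)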
@@ -519,12 +537,14 @@ class DeepseekV2MoE(nn.Module):
             torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
             final_hidden_states = final_hidden_states_out
             sm.tag(final_hidden_states)
-        if self.tp_size > 1 and not
+        if self.tp_size > 1 and not should_allreduce_fusion and not use_reduce_scatter:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
     def forward_cpu(
-        self,
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
     ) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
@@ -575,7 +595,7 @@ class DeepseekV2MoE(nn.Module):
             None, # a2_scale
             True, # is_vnni
         )
-        if self.tp_size > 1 and not
+        if self.tp_size > 1 and not should_allreduce_fusion:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
@@ -1176,6 +1196,16 @@ class DeepseekV2AttentionMLA(nn.Module):
         output, _ = self.o_proj(attn_output)
         return output
 
+    def _fuse_rope_for_trtllm_mla(self, forward_batch: ForwardBatch) -> bool:
+        """
+        Check if we should skip rope and do fused rope+quantize for TRTLLM MLA decode in fp8_e4m3 path.
+        """
+        return (
+            self.current_attention_backend == "trtllm_mla"
+            and forward_batch.forward_mode.is_decode_or_idle()
+            and forward_batch.attn_backend.data_type == torch.float8_e4m3fn
+        )
+
     def forward_absorb_prepare(
         self,
         positions: torch.Tensor,
@@ -1255,7 +1285,9 @@ class DeepseekV2AttentionMLA(nn.Module):
         q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
 
         q_nope_out = q_nope_out.transpose(0, 1)
-
+
+        if not self._fuse_rope_for_trtllm_mla(forward_batch):
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
 
         return q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator
 
@@ -1268,8 +1300,20 @@ class DeepseekV2AttentionMLA(nn.Module):
             or self.current_attention_backend == "cutlass_mla"
             or self.current_attention_backend == "trtllm_mla"
         ):
+            extra_args = {}
+            if self._fuse_rope_for_trtllm_mla(forward_batch):
+                extra_args = {
+                    "cos_sin_cache": self.rotary_emb.cos_sin_cache,
+                    "is_neox": self.rotary_emb.is_neox_style,
+                }
             attn_output = self.attn_mqa(
-                q_nope_out,
+                q_nope_out,
+                k_nope,
+                k_nope,
+                forward_batch,
+                q_rope=q_pe,
+                k_rope=k_pe,
+                **extra_args,
             )
         else:
             q = torch.cat([q_nope_out, q_pe], dim=-1)
@@ -1821,8 +1865,11 @@ class DeepseekV2DecoderLayer(nn.Module):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
         )
 
+        self._fuse_allreduce_lookup_table = self._build_fuse_allreduce_lookup_table()
+
     def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
         return is_nextn or (
             self.config.n_routed_experts is not None
@@ -1831,27 +1878,18 @@ class DeepseekV2DecoderLayer(nn.Module):
         )
 
     def _should_fuse_mlp_allreduce_with_next_layer(self, forward_batch) -> bool:
-        """Check if MLP allreduce can be fused with next layer's
+        """Check if MLP allreduce can be fused with next layer's residual_rmsnorm"""
 
-
-
-
-
-
-
-        if not global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False):
-            return False
-
-        if not _is_sm100_supported or not _is_flashinfer_available:
-            return False
+        batch_size = (
+            forward_batch.input_ids.shape[0]
+            if hasattr(forward_batch, "input_ids")
+            else 0
+        )
 
-        if
-            forward_batch.input_ids.shape[0] == 0
-            or forward_batch.input_ids.shape[0] > 128
-        ):
+        if batch_size > 128:
             return False
 
-        return
+        return self._fuse_allreduce_lookup_table.get(batch_size, False)
 
     def forward(
         self,
@@ -1877,18 +1915,24 @@ class DeepseekV2DecoderLayer(nn.Module):
             hidden_states, residual, forward_batch
         )
 
-
+        should_allreduce_fusion = (
             self._should_fuse_mlp_allreduce_with_next_layer(forward_batch)
             and not (self.enable_dp_attention and self.speculative_algorithm.is_eagle())
             and not self.is_nextn
         )
 
-
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
 
-        if
+        if should_allreduce_fusion:
             hidden_states._sglang_needs_allreduce_fusion = True
 
-        if not
+        if not should_allreduce_fusion:
             hidden_states, residual = self.layer_communicator.postprocess_layer(
                 hidden_states, residual, forward_batch
             )
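Note: the use_reduce_scatter flag threaded through forward and the MLP follows the new comment: with DP padding, every rank holds the same number of tokens, so the MLP output can be combined with a reduce-scatter (each rank keeps one shard) rather than a full all-reduce, and the down-projection skips its internal all-reduce. The torch.distributed sketch below is purely illustrative of that collective swap, not sglang's implementation; it assumes the process group is already initialized and the row dimension is padded to a multiple of the world size.

# Illustrative only: reduce-scatter vs. all-reduce for an evenly padded activation.
import torch
import torch.distributed as dist

def combine_partial_output(partial: torch.Tensor, use_reduce_scatter: bool) -> torch.Tensor:
    if use_reduce_scatter:
        world_size = dist.get_world_size()
        # Each rank receives only its 1/world_size slice of the summed rows.
        out = torch.empty(
            partial.shape[0] // world_size, *partial.shape[1:],
            dtype=partial.dtype, device=partial.device,
        )
        dist.reduce_scatter_tensor(out, partial)
        return out
    # Full all-reduce: every rank ends up with the complete summed tensor.
    dist.all_reduce(partial)
    return partial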
@@ -1965,6 +2009,26 @@ class DeepseekV2DecoderLayer(nn.Module):
         )
         return output
 
+    def _build_fuse_allreduce_lookup_table(self):
+        static_conditions_met = (
+            self.layer_id != self.config.num_hidden_layers - 1
+            and get_tensor_model_parallel_world_size() > 1
+            and global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False)
+            and _is_sm100_supported
+            and _is_flashinfer_available
+        )
+
+        if not static_conditions_met:
+            return {}
+
+        lookup_table = {}
+        for batch_size in range(129): # 0 to 128
+            is_last_layer = self.layer_id == self.config.num_hidden_layers - 1
+            should_fuse = batch_size > 0 and batch_size <= 128 and not is_last_layer
+            lookup_table[batch_size] = should_fuse
+
+        return lookup_table
+
 
 class DeepseekV2Model(nn.Module):
     fall_back_to_pt_during_load = False
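Note: _build_fuse_allreduce_lookup_table above evaluates the static eligibility checks (not the last layer, TP world size > 1, the FlashInfer allreduce-fusion flag, SM100 hardware, FlashInfer availability) once at construction time, so the per-forward check in _should_fuse_mlp_allreduce_with_next_layer collapses to a bounds check plus a dict lookup keyed by batch size. The standalone sketch below shows the same precompute-then-lookup pattern; the function names are illustrative, not sglang internals.

# Illustrative sketch of the precompute-then-lookup pattern used above.
def build_fuse_lookup_table(static_conditions_met: bool) -> dict:
    if not static_conditions_met:
        return {}
    # Batch sizes 1..128 are eligible for fusion; 0 is not.
    return {batch_size: batch_size > 0 for batch_size in range(129)}

def should_fuse_allreduce(table: dict, batch_size: int) -> bool:
    if batch_size > 128:  # outside the precomputed range
        return False
    return table.get(batch_size, False)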
@@ -2060,6 +2124,8 @@ class DeepseekV2Model(nn.Module):
 
 
 class DeepseekV2ForCausalLM(nn.Module):
+    # for quark model load
+    packed_modules_mapping = {}
 
     def __init__(
         self,
@@ -2068,6 +2134,18 @@ class DeepseekV2ForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+
+        # for quark model load
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config