sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +4 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  16. sglang/srt/function_call/ebnf_composer.py +10 -3
  17. sglang/srt/function_call/function_call_parser.py +2 -0
  18. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  19. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  20. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  21. sglang/srt/layers/attention/vision.py +56 -8
  22. sglang/srt/layers/layernorm.py +26 -1
  23. sglang/srt/layers/logits_processor.py +14 -3
  24. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  27. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  28. sglang/srt/layers/moe/topk.py +84 -22
  29. sglang/srt/layers/multimodal.py +11 -8
  30. sglang/srt/layers/quantization/fp8.py +25 -247
  31. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  32. sglang/srt/layers/quantization/modelopt_quant.py +25 -10
  33. sglang/srt/layers/quantization/unquant.py +24 -76
  34. sglang/srt/layers/quantization/w4afp8.py +68 -17
  35. sglang/srt/lora/lora_registry.py +93 -29
  36. sglang/srt/managers/cache_controller.py +9 -7
  37. sglang/srt/managers/mm_utils.py +154 -35
  38. sglang/srt/managers/multimodal_processor.py +3 -14
  39. sglang/srt/managers/schedule_batch.py +14 -8
  40. sglang/srt/managers/scheduler.py +35 -1
  41. sglang/srt/managers/tokenizer_manager.py +37 -6
  42. sglang/srt/managers/tp_worker.py +3 -0
  43. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  44. sglang/srt/model_executor/model_runner.py +68 -14
  45. sglang/srt/models/deepseek_v2.py +62 -28
  46. sglang/srt/models/glm4_moe.py +1035 -0
  47. sglang/srt/models/glm4_moe_nextn.py +167 -0
  48. sglang/srt/models/interns1.py +328 -0
  49. sglang/srt/models/internvl.py +143 -47
  50. sglang/srt/models/llava.py +9 -5
  51. sglang/srt/models/minicpmo.py +4 -1
  52. sglang/srt/models/qwen2_moe.py +2 -2
  53. sglang/srt/models/qwen3_moe.py +5 -2
  54. sglang/srt/multimodal/processors/base_processor.py +20 -6
  55. sglang/srt/multimodal/processors/clip.py +2 -2
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  57. sglang/srt/multimodal/processors/gemma3.py +2 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  59. sglang/srt/multimodal/processors/internvl.py +21 -8
  60. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  61. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  62. sglang/srt/multimodal/processors/llava.py +4 -4
  63. sglang/srt/multimodal/processors/minicpm.py +2 -3
  64. sglang/srt/multimodal/processors/mlama.py +2 -2
  65. sglang/srt/multimodal/processors/mllama4.py +18 -111
  66. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  67. sglang/srt/multimodal/processors/pixtral.py +2 -2
  68. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  69. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  70. sglang/srt/multimodal/processors/vila.py +3 -1
  71. sglang/srt/reasoning_parser.py +2 -1
  72. sglang/srt/server_args.py +57 -6
  73. sglang/srt/utils.py +96 -1
  74. sglang/srt/weight_sync/utils.py +119 -0
  75. sglang/test/runners.py +4 -0
  76. sglang/test/test_utils.py +65 -5
  77. sglang/utils.py +19 -0
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
  80. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
  81. sglang/srt/debug_utils.py +0 -74
  82. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
@@ -1308,9 +1308,58 @@ class ModelRunner:
         else:
             self.attn_backend = self._get_attention_backend()

-    # TODO unify with 6338
     def _get_attention_backend(self):
-        if self.server_args.attention_backend == "flashinfer":
+        """Init attention kernel backend."""
+        self.decode_attention_backend_str = (
+            self.server_args.decode_attention_backend
+            if self.server_args.decode_attention_backend
+            else self.server_args.attention_backend
+        )
+        self.prefill_attention_backend_str = (
+            self.server_args.prefill_attention_backend
+            if self.server_args.prefill_attention_backend
+            else self.server_args.attention_backend
+        )
+        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
+            assert (
+                self.server_args.speculative_algorithm is None
+            ), "Currently HybridAttentionBackend does not support speculative decoding."
+            from sglang.srt.layers.attention.hybrid_attn_backend import (
+                HybridAttnBackend,
+            )
+
+            attn_backend = HybridAttnBackend(
+                decode_backend=self._get_attention_backend_from_str(
+                    self.decode_attention_backend_str
+                ),
+                prefill_backend=self._get_attention_backend_from_str(
+                    self.prefill_attention_backend_str
+                ),
+            )
+            logger.info(
+                f"Using hybrid attention backend for decode and prefill: "
+                f"decode_backend={self.decode_attention_backend_str}, "
+                f"prefill_backend={self.prefill_attention_backend_str}."
+            )
+            logger.warning(
+                f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
+                f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
+            )
+        else:
+            attn_backend = self._get_attention_backend_from_str(
+                self.server_args.attention_backend
+            )
+
+        global_server_args_dict.update(
+            {
+                "decode_attention_backend": self.decode_attention_backend_str,
+                "prefill_attention_backend": self.prefill_attention_backend_str,
+            }
+        )
+        return attn_backend
+
+    def _get_attention_backend_from_str(self, backend_str: str):
+        if backend_str == "flashinfer":
             if not self.use_mla_backend:
                 from sglang.srt.layers.attention.flashinfer_backend import (
                     FlashInferAttnBackend,
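Read in isolation, the new `_get_attention_backend` reduces to a small resolution step: each phase falls back to `--attention-backend` when its dedicated option is unset, and the `HybridAttnBackend` wrapper is only built when the two resolved names differ. A minimal standalone sketch of that logic, with an illustrative helper name that is not part of the package:

# Sketch of the backend-resolution logic above; `resolve_backends` is a
# hypothetical helper, not sglang API.
from typing import Optional, Tuple


def resolve_backends(
    attention_backend: str,
    decode_attention_backend: Optional[str] = None,
    prefill_attention_backend: Optional[str] = None,
) -> Tuple[str, str]:
    # Each phase falls back to the global --attention-backend value.
    decode = decode_attention_backend or attention_backend
    prefill = prefill_attention_backend or attention_backend
    return decode, prefill


decode, prefill = resolve_backends("fa3", decode_attention_backend="flashinfer")
use_hybrid = decode != prefill  # only then does ModelRunner build HybridAttnBackend
print(decode, prefill, use_hybrid)  # -> flashinfer fa3 True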
@@ -1318,7 +1367,11 @@ class ModelRunner:

                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
-                    self.plan_stream_for_flashinfer = torch.cuda.Stream()
+                    if (
+                        not hasattr(self, "plan_stream_for_flashinfer")
+                        or not self.plan_stream_for_flashinfer
+                    ):
+                        self.plan_stream_for_flashinfer = torch.cuda.Stream()
                 return FlashInferAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
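The added guard makes the EAGLE plan stream creation idempotent: backend construction is now routed through a helper that may be invoked more than once on the same ModelRunner, so the CUDA stream is only created if it does not already exist. A minimal sketch of the same create-once pattern, using a toy class rather than the real ModelRunner:

# Equivalent lazy-init pattern (toy stand-in; requires a CUDA device to run).
import torch


class _StreamOwner:
    def ensure_plan_stream(self) -> "torch.cuda.Stream":
        # Same effect as the hasattr/falsy check in the diff: create the
        # stream only on the first call, reuse it afterwards.
        if not getattr(self, "plan_stream_for_flashinfer", None):
            self.plan_stream_for_flashinfer = torch.cuda.Stream()
        return self.plan_stream_for_flashinfer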
@@ -1326,15 +1379,15 @@ class ModelRunner:
                 )

                 return FlashInferMLAAttnBackend(self)
-        elif self.server_args.attention_backend == "aiter":
+        elif backend_str == "aiter":
             from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend

             return AiterAttnBackend(self)
-        elif self.server_args.attention_backend == "ascend":
+        elif backend_str == "ascend":
             from sglang.srt.layers.attention.ascend_backend import AscendAttnBackend

             return AscendAttnBackend(self)
-        elif self.server_args.attention_backend == "triton":
+        elif backend_str == "triton":
             assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
@@ -1349,17 +1402,17 @@ class ModelRunner:
             from sglang.srt.layers.attention.triton_backend import TritonAttnBackend

             return TritonAttnBackend(self)
-        elif self.server_args.attention_backend == "torch_native":
+        elif backend_str == "torch_native":
             from sglang.srt.layers.attention.torch_native_backend import (
                 TorchNativeAttnBackend,
             )

             return TorchNativeAttnBackend(self)
-        elif self.server_args.attention_backend == "flashmla":
+        elif backend_str == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend

             return FlashMLABackend(self)
-        elif self.server_args.attention_backend == "fa3":
+        elif backend_str == "fa3":
             assert (
                 torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
             ) or torch.cuda.get_device_capability()[0] == 9, (
@@ -1371,7 +1424,7 @@ class ModelRunner:
            )

             return FlashAttentionBackend(self)
-        elif self.server_args.attention_backend == "cutlass_mla":
+        elif backend_str == "cutlass_mla":
             from sglang.srt.layers.attention.cutlass_mla_backend import (
                 CutlassMLABackend,
             )
@@ -1385,9 +1438,7 @@ class ModelRunner:
             logger.info(f"Intel AMX attention backend is enabled.")
             return IntelAMXAttnBackend(self)
         else:
-            raise ValueError(
-                f"Invalid attention backend: {self.server_args.attention_backend}"
-            )
+            raise ValueError(f"Invalid attention backend: {backend_str}")

     def init_double_sparsity_channel_config(self, selected_channel):
         selected_channel = "." + selected_channel + "_proj"
@@ -1475,7 +1526,10 @@ class ModelRunner:
         if self.support_pp:
             kwargs["pp_proxy_tensors"] = pp_proxy_tensors
         return self.model.forward(
-            forward_batch.input_ids, forward_batch.positions, forward_batch, **kwargs
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
         )

     def forward_extend(
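The split above is driven by two new server arguments that the code reads (`decode_attention_backend`, `prefill_attention_backend`). A hypothetical configuration sketch follows; the field names come from the diff, while the constructor-style usage and the model path are assumptions, not documented API:

# Hypothetical usage sketch: mixing backends per phase.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    attention_backend="fa3",                # used where no per-phase override is set
    decode_attention_backend="flashinfer",  # decode-only override -> hybrid mode
)
# Because the resolved decode and prefill backends differ, ModelRunner builds a
# HybridAttnBackend and emits the experimental-feature warning shown in the diff.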
sglang/srt/models/deepseek_v2.py
@@ -56,7 +56,11 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.ep_moe.layer import (
+    DeepEPMoE,
+    get_moe_impl_class,
+    use_flashinfer_trtllm_moe,
+)
 from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization import deep_gemm_wrapper
@@ -302,15 +306,19 @@ class DeepseekV2MoE(nn.Module):
             config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
         )

-        self.topk = TopK(
-            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
-            renormalize=config.norm_topk_prob,
-            use_grouped_topk=True,
-            num_expert_group=config.n_group,
-            num_fused_shared_experts=self.num_fused_shared_experts,
-            topk_group=config.topk_group,
-            correction_bias=self.gate.e_score_correction_bias,
-            routed_scaling_factor=self.routed_scaling_factor,
+        self.topk = (
+            TopK(
+                top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+                renormalize=config.norm_topk_prob,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                num_fused_shared_experts=self.num_fused_shared_experts,
+                topk_group=config.topk_group,
+                correction_bias=self.gate.e_score_correction_bias,
+                routed_scaling_factor=self.routed_scaling_factor,
+            )
+            if not use_flashinfer_trtllm_moe
+            else None
         )

         self.experts = get_moe_impl_class()(
@@ -332,10 +340,22 @@ class DeepseekV2MoE(nn.Module):
             # Additional args for FusedMoE
             **(
                 dict(
-                    enable_flashinfer_moe=True,
+                    enable_flashinfer_cutlass_moe=True,
                     enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
-                if global_server_args_dict["enable_flashinfer_moe"]
+                if global_server_args_dict["enable_flashinfer_cutlass_moe"]
+                else {}
+            ),
+            **(
+                dict(
+                    renormalize=config.norm_topk_prob,
+                    use_grouped_topk=True,
+                    num_expert_group=config.n_group,
+                    num_fused_shared_experts=self.num_fused_shared_experts,
+                    topk_group=config.topk_group,
+                    correction_bias=self.gate.e_score_correction_bias,
+                )
+                if use_flashinfer_trtllm_moe
                 else {}
             ),
         )
@@ -455,10 +475,12 @@ class DeepseekV2MoE(nn.Module):
         with torch.cuda.stream(self.alt_stream):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
-            topk_output = self.topk(hidden_states, router_logits)
-            final_hidden_states = self.experts(
-                hidden_states=hidden_states, topk_output=topk_output
-            )
+            kwargs = {"hidden_states": hidden_states}
+            if self.topk is not None:
+                kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+            else:
+                kwargs["router_logits"] = router_logits
+            final_hidden_states = self.experts(**kwargs)
             if not _is_cuda:
                 final_hidden_states *= self.routed_scaling_factor
         current_stream.wait_stream(self.alt_stream)
@@ -478,10 +500,12 @@ class DeepseekV2MoE(nn.Module):
         shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
-        topk_output = self.topk(hidden_states, router_logits)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states, topk_output=topk_output
-        )
+        kwargs = {"hidden_states": hidden_states}
+        if self.topk is not None:
+            kwargs["topk_output"] = self.topk(hidden_states, router_logits)
+        else:
+            kwargs["router_logits"] = router_logits
+        final_hidden_states = self.experts(**kwargs)
         if not _is_cuda and not _use_aiter:
             # fused in biased_grouped_topk so we can skip here
             final_hidden_states *= self.routed_scaling_factor
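The pattern repeated in the two forward paths above is easier to read in isolation: when `use_flashinfer_trtllm_moe` is active, `self.topk` is `None` and the experts receive the raw `router_logits` (top-k selection happens inside the fused kernel); otherwise the standalone `TopK` module runs first and its result is passed as `topk_output`. A condensed sketch with an illustrative helper name:

# Condensed sketch of the routing dispatch above; `run_experts` is illustrative,
# the real attributes are DeepseekV2MoE.topk and DeepseekV2MoE.experts.
def run_experts(experts, topk, hidden_states, router_logits):
    kwargs = {"hidden_states": hidden_states}
    if topk is not None:
        # Standalone grouped top-k routing, then dispatch to the experts.
        kwargs["topk_output"] = topk(hidden_states, router_logits)
    else:
        # Fused path (flashinfer TRT-LLM MoE): the kernel performs the routing,
        # so it takes the raw router logits directly.
        kwargs["router_logits"] = router_logits
    return experts(**kwargs)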
@@ -901,7 +925,10 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.disable_chunked_prefix_cache = global_server_args_dict[
             "disable_chunked_prefix_cache"
         ]
-        self.attention_backend = global_server_args_dict["attention_backend"]
+
+        self.current_attention_backend = (
+            None  # Attention backend used by current forward batch
+        )
         self.rocm_fused_decode_mla = get_bool_env_var(
             "SGLANG_ROCM_FUSED_DECODE_MLA", "false"
         )
@@ -985,9 +1012,16 @@ class DeepseekV2AttentionMLA(nn.Module):
             else:
                 return AttnForwardMethod.MLA

-        if self.attention_backend == "ascend":
+        # Determine attention backend used by current forward batch
+        if forward_batch.forward_mode.is_decode_or_idle():
+            attention_backend = global_server_args_dict["decode_attention_backend"]
+        else:
+            attention_backend = global_server_args_dict["prefill_attention_backend"]
+        self.current_attention_backend = attention_backend
+
+        if attention_backend == "ascend":
             return AttnForwardMethod.MLA
-        elif self.attention_backend == "flashinfer":
+        elif attention_backend == "flashinfer":
             # Flashinfer MLA: Do not absorb when enabling ragged prefill
             if (
                 not self.flashinfer_mla_disable_ragged
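Because decode and prefill may now run different kernels, the MLA dispatch above looks up the backend for the current batch from `global_server_args_dict` instead of caching a single `attention_backend` string at construction time. A restatement of that lookup as a hypothetical helper, for reference only:

# Hypothetical restatement of the per-batch backend selection above.
def current_backend(forward_batch, server_args_dict: dict) -> str:
    if forward_batch.forward_mode.is_decode_or_idle():
        return server_args_dict["decode_attention_backend"]
    return server_args_dict["prefill_attention_backend"]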
@@ -999,7 +1033,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA
             else:
                 return _dispatch_mla_subtype()
-        elif self.attention_backend == "fa3":
+        elif attention_backend == "fa3":
             # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
             if forward_batch.extend_prefix_lens_cpu is not None:
                 sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu)
@@ -1016,7 +1050,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MHA_CHUNKED_KV
             else:
                 return _dispatch_mla_subtype()
-        elif self.attention_backend == "aiter":
+        elif attention_backend == "aiter":
             if (
                 forward_batch.forward_mode.is_extend()
                 and not forward_batch.forward_mode.is_target_verify()
@@ -1264,9 +1298,9 @@ class DeepseekV2AttentionMLA(nn.Module):
         self, q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator
     ):
         if (
-            self.attention_backend == "fa3"
-            or self.attention_backend == "flashinfer"
-            or self.attention_backend == "cutlass_mla"
+            self.current_attention_backend == "fa3"
+            or self.current_attention_backend == "flashinfer"
+            or self.current_attention_backend == "cutlass_mla"
         ):
             attn_output = self.attn_mqa(
                 q_nope_out, k_nope, k_nope, forward_batch, q_rope=q_pe, k_rope=k_pe