sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +5 -1
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +17 -2
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +65 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +5 -9
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +148 -72
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +105 -66
- sglang/srt/function_call/function_call_parser.py +6 -4
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +46 -25
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +88 -34
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +33 -14
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +188 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +62 -13
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +27 -11
- sglang/srt/managers/scheduler.py +48 -26
- sglang/srt/managers/tokenizer_manager.py +62 -28
- sglang/srt/managers/tp_worker.py +5 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +35 -18
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +109 -37
- sglang/srt/models/deepseek_v2.py +63 -30
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +2 -6
- sglang/srt/models/qwen3_moe.py +6 -8
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +48 -5
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +132 -60
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +113 -69
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_activation.py +50 -1
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -20,10 +20,10 @@ import logging
 import os
 import random
 import tempfile
-from token import OP
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -145,12 +145,14 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
     # Kernel backend
     attention_backend: Optional[str] = None
+    decode_attention_backend: Optional[str] = None
+    prefill_attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
@@ -169,7 +171,8 @@ class ServerArgs:
     ep_size: int = 1
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
-    enable_flashinfer_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     ep_num_redundant_experts: int = 0
@@ -266,31 +269,20 @@ class ServerArgs:
 
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
 
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
@@ -359,7 +351,6 @@ class ServerArgs:
             self.chunked_prefill_size = 16384
         else:
             self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -398,18 +389,32 @@ class ServerArgs:
             )
             self.page_size = 128
 
-        if self.attention_backend == "flashmla":
+        if (
+            self.attention_backend == "flashmla"
+            or self.decode_attention_backend == "flashmla"
+        ):
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
             self.page_size = 64
 
-        if self.attention_backend == "cutlass_mla":
+        if (
+            self.attention_backend == "cutlass_mla"
+            or self.decode_attention_backend == "cutlass_mla"
+        ):
             logger.warning(
                 "Cutlass MLA only supports a page_size of 128, change page_size to 128."
             )
             self.page_size = 128
 
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
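For reference, the page-size resolution introduced above can be condensed into a small standalone sketch. The helper below is illustrative only (not sglang API); it mirrors the order of the checks in __post_init__: backend-specific overrides first, then a default of 1 when --page-size was not given.

from typing import Optional


def resolve_page_size(
    page_size: Optional[int],
    attention_backend: Optional[str],
    decode_attention_backend: Optional[str],
) -> int:
    # FlashMLA only supports a page size of 64.
    if "flashmla" in (attention_backend, decode_attention_backend):
        return 64
    # Cutlass MLA only supports a page size of 128.
    if "cutlass_mla" in (attention_backend, decode_attention_backend):
        return 128
    # page_size is now Optional and falls back to 1 when unset.
    return page_size if page_size is not None else 1


assert resolve_page_size(None, None, "flashmla") == 64
assert resolve_page_size(None, "triton", None) == 1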
@@ -431,6 +436,17 @@ class ServerArgs:
             self.enable_dp_attention
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
+        # MoE kernel
+        if self.enable_flashinfer_cutlass_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
+        if self.enable_flashinfer_trtllm_moe:
+            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
+            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
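A minimal standalone sketch of the two new flag constraints enforced above (the helper name is hypothetical; the error messages are taken from the assertions in the hunk):

from typing import Optional


def validate_moe_flags(
    enable_flashinfer_cutlass_moe: bool,
    enable_flashinfer_trtllm_moe: bool,
    quantization: Optional[str],
    enable_ep_moe: bool,
) -> None:
    # The CUTLASS path is tied to modelopt_fp4 quantization.
    if enable_flashinfer_cutlass_moe and quantization != "modelopt_fp4":
        raise ValueError("modelopt_fp4 quantization is required for Flashinfer MOE")
    # The TRTLLM path builds on expert parallelism (--enable-ep-moe).
    if enable_flashinfer_trtllm_moe and not enable_ep_moe:
        raise ValueError("EP MoE is required for Flashinfer TRTLLM MOE")


validate_moe_flags(True, False, "modelopt_fp4", False)  # ok
validate_moe_flags(False, True, None, True)  # ok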
@@ -455,6 +471,9 @@ class ServerArgs:
                 "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )
 
+        if self.enable_eplb:
+            assert self.enable_ep_moe or self.enable_deepep_moe
+
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
         ):
@@ -494,7 +513,7 @@ class ServerArgs:
             )
 
         model_arch = self.get_hf_config().architectures[0]
-        if model_arch
+        if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
             # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
@@ -502,14 +521,6 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
-        elif "Llama4" in model_arch:
-            # TODO: remove this after Llama4 supports in other backends
-            if self.attention_backend != "fa3":
-                self.attention_backend = "fa3"
-                logger.warning(
-                    "Llama4 requires using fa3 attention backend. "
-                    "Attention backend is automatically set to fa3."
-                )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
@@ -542,12 +553,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
 
         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -572,6 +582,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -580,9 +591,6 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -1099,10 +1107,11 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
-                "qwen3",
+                "qwen3_coder",
+                "glm45",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )
 
         # Data parallelism
@@ -1213,6 +1222,35 @@ class ServerArgs:
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
+
+        parser.add_argument(
+            "--prefill-attention-backend",
+            type=str,
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
+            default=ServerArgs.prefill_attention_backend,
+            help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
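Per their help strings, the two new flags take priority over --attention-backend for their respective phase. A standalone sketch of that resolution order (hypothetical helper, not sglang internals):

from typing import Optional


def pick_attention_backend(
    phase: str,
    attention_backend: Optional[str],
    decode_attention_backend: Optional[str],
    prefill_attention_backend: Optional[str],
) -> Optional[str]:
    # Per-phase flags win when set; otherwise fall back to --attention-backend.
    if phase == "decode" and decode_attention_backend is not None:
        return decode_attention_backend
    if phase == "prefill" and prefill_attention_backend is not None:
        return prefill_attention_backend
    return attention_backend


assert pick_attention_backend("decode", "triton", "flashmla", None) == "flashmla"
assert pick_attention_backend("prefill", "triton", "flashmla", None) == "triton"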
@@ -1227,6 +1265,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1276,13 +1321,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1298,10 +1336,15 @@ class ServerArgs:
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
         parser.add_argument(
-            "--enable-flashinfer-moe",
+            "--enable-flashinfer-cutlass-moe",
             action="store_true",
             help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
         )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+        )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
@@ -1530,11 +1573,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1792,11 +1830,11 @@ class ServerArgs:
         return hf_config
 
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
 
-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1807,11 +1845,7 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1820,9 +1854,32 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"
 
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
         self.check_lora_server_args()
 
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
@@ -1843,9 +1900,24 @@ class ServerArgs:
             for lora_path in lora_paths:
                 if "=" in lora_path:
                     name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
+                    self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
                 else:
-                    self.lora_paths[lora_path] = lora_path
+                    self.lora_paths[lora_path] = LoRARef(
+                        lora_name=lora_path,
+                        lora_path=lora_path,
+                    )
+        elif isinstance(self.lora_paths, dict):
+            self.lora_paths = {
+                k: LoRARef(lora_name=k, lora_path=v)
+                for k, v in self.lora_paths.items()
+            }
+        elif self.lora_paths is None:
+            self.lora_paths = {}
+        else:
+            raise ValueError(
+                f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                "Expected a list or a dictionary."
+            )
 
         # Expand target modules
         if self.lora_target_modules:
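The normalization above accepts --lora-paths either as a list of "name=path" strings or as a dict, and maps both onto LoRARef records keyed by adapter name. A standalone sketch of the same mapping, using a hypothetical stand-in for sglang.srt.lora.lora_registry.LoRARef:

from dataclasses import dataclass
from typing import Dict, List, Union


@dataclass
class LoRARefSketch:  # hypothetical stand-in, only lora_name/lora_path as in the diff
    lora_name: str
    lora_path: str


def normalize_lora_paths(
    lora_paths: Union[List[str], Dict[str, str], None],
) -> Dict[str, LoRARefSketch]:
    if isinstance(lora_paths, list):
        out = {}
        for item in lora_paths:
            if "=" in item:
                name, path = item.split("=", 1)
                out[name] = LoRARefSketch(lora_name=name, lora_path=path)
            else:
                out[item] = LoRARefSketch(lora_name=item, lora_path=item)
        return out
    if isinstance(lora_paths, dict):
        return {k: LoRARefSketch(lora_name=k, lora_path=v) for k, v in lora_paths.items()}
    if lora_paths is None:
        return {}
    raise ValueError(f"Invalid type for --lora-paths: {type(lora_paths)}.")


assert normalize_lora_paths(["alice=/ckpt/alice"])["alice"].lora_path == "/ckpt/alice"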
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Callable
 
 import torch
 
+from sglang.srt.layers.dp_attention import DPPaddingMode
 from sglang.srt.model_executor.cuda_graph_runner import (
     CUDA_GRAPH_CAPTURE_FAILED_MSG,
     CudaGraphRunner,
@@ -97,13 +98,6 @@ class EAGLEDraftCudaGraphRunner:
         )
 
         if self.require_gathered_buffer:
-            self.gathered_buffer = torch.zeros(
-                (
-                    self.max_num_token,
-                    self.model_runner.model_config.hidden_size,
-                ),
-                dtype=self.model_runner.dtype,
-            )
             if self.require_mlp_tp_gather:
                 self.global_num_tokens_gpu = torch.zeros(
                     (self.dp_size,), dtype=torch.int32
@@ -111,12 +105,30 @@ class EAGLEDraftCudaGraphRunner:
                 self.global_num_tokens_for_logprob_gpu = torch.zeros(
                     (self.dp_size,), dtype=torch.int32
                 )
+                self.gathered_buffer = torch.zeros(
+                    (
+                        self.max_num_token * self.dp_size,
+                        self.model_runner.model_config.hidden_size,
+                    ),
+                    dtype=self.model_runner.dtype,
+                )
             else:
                 assert self.require_attn_tp_gather
                 self.global_num_tokens_gpu = torch.zeros((1,), dtype=torch.int32)
                 self.global_num_tokens_for_logprob_gpu = torch.zeros(
                     (1,), dtype=torch.int32
                 )
+                self.gathered_buffer = torch.zeros(
+                    (
+                        self.max_num_token,
+                        self.model_runner.model_config.hidden_size,
+                    ),
+                    dtype=self.model_runner.dtype,
+                )
+        else:
+            self.global_num_tokens_gpu = None
+            self.global_num_tokens_for_logprob_gpu = None
+            self.gathered_buffer = None
 
         # Capture
         try:
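The allocation sizes above follow from the new padding scheme: with MLP TP gather every DP rank is padded to the same token count, so the gathered buffer needs max_num_token rows per rank, while the attention-TP-gather-only case keeps a single rank's worth. A tiny illustrative helper (names are hypothetical):

def gathered_buffer_rows(
    max_num_token: int, dp_size: int, require_mlp_tp_gather: bool
) -> int:
    # Every DP rank is padded to max_num_token tokens under MLP TP gather.
    return max_num_token * dp_size if require_mlp_tp_gather else max_num_token


assert gathered_buffer_rows(max_num_token=8, dp_size=4, require_mlp_tp_gather=True) == 32
assert gathered_buffer_rows(max_num_token=8, dp_size=4, require_mlp_tp_gather=False) == 8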
@@ -130,9 +142,9 @@ class EAGLEDraftCudaGraphRunner:
     def can_run(self, forward_batch: ForwardBatch):
         if self.require_mlp_tp_gather:
             cuda_graph_bs = (
-
+                max(forward_batch.global_num_tokens_cpu) // self.num_tokens_per_bs
                 if self.model_runner.spec_algorithm.is_eagle()
-                else
+                else max(forward_batch.global_num_tokens_cpu)
             )
         else:
             cuda_graph_bs = forward_batch.batch_size
@@ -168,26 +180,20 @@ class EAGLEDraftCudaGraphRunner:
         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
                 torch.tensor(
-                    [
-                        num_tokens // self.dp_size + (i < (num_tokens % self.dp_size))
-                        for i in range(self.dp_size)
-                    ],
+                    [num_tokens] * self.dp_size,
                     dtype=torch.int32,
                     device=self.input_ids.device,
                 )
             )
             self.global_num_tokens_for_logprob_gpu.copy_(
                 torch.tensor(
-                    [
-                        num_tokens // self.dp_size + (i < (num_tokens % self.dp_size))
-                        for i in range(self.dp_size)
-                    ],
+                    [num_tokens] * self.dp_size,
                     dtype=torch.int32,
                     device=self.input_ids.device,
                 )
             )
             global_num_tokens = self.global_num_tokens_gpu
-            gathered_buffer = self.gathered_buffer[:num_tokens]
+            gathered_buffer = self.gathered_buffer[: num_tokens * self.dp_size]
             global_num_tokens_for_logprob = self.global_num_tokens_for_logprob_gpu
         elif self.require_attn_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -233,6 +239,7 @@ class EAGLEDraftCudaGraphRunner:
             return_logprob=False,
             positions=positions,
             global_num_tokens_gpu=global_num_tokens,
+            dp_padding_mode=DPPaddingMode.get_default_mode_in_cuda_graph(),
             gathered_buffer=gathered_buffer,
             spec_algorithm=self.model_runner.spec_algorithm,
             spec_info=spec_info,
@@ -290,12 +297,13 @@ class EAGLEDraftCudaGraphRunner:
 
         # Pad
         if self.require_mlp_tp_gather:
-
-
+            max_num_tokens = max(forward_batch.global_num_tokens_cpu)
+            max_batch_size = (
+                max_num_tokens // self.num_tokens_per_bs
                 if self.model_runner.spec_algorithm.is_eagle()
-                else
+                else max_num_tokens
             )
-            index = bisect.bisect_left(self.capture_bs,
+            index = bisect.bisect_left(self.capture_bs, max_batch_size)
         else:
             index = bisect.bisect_left(self.capture_bs, raw_bs)
         bs = self.capture_bs[index]
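The padding step above picks the smallest captured batch size that can hold the padded batch via a bisect over the captured sizes. A self-contained sketch of that selection (the values are illustrative only):

import bisect

capture_bs = [1, 2, 4, 8, 16, 32]  # sizes the CUDA graphs were captured with
max_batch_size = 5                 # padded batch size derived above
bs = capture_bs[bisect.bisect_left(capture_bs, max_batch_size)]
assert bs == 8                     # replay the graph captured for bs=8 and pad up to it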
@@ -316,12 +324,10 @@ class EAGLEDraftCudaGraphRunner:
         self.topk_index[:raw_bs].copy_(forward_batch.spec_info.topk_index)
         self.hidden_states[:raw_bs].copy_(forward_batch.spec_info.hidden_states)
 
+        # TODO(ch-wan): support num_token_non_padded
         if self.require_gathered_buffer:
-            self.global_num_tokens_gpu.
-            self.global_num_tokens_for_logprob_gpu.
-                forward_batch.global_num_tokens_for_logprob_gpu
-            )
-            forward_batch.gathered_buffer = self.gathered_buffer
+            self.global_num_tokens_gpu.fill_(bs * self.num_tokens_per_bs)
+            self.global_num_tokens_for_logprob_gpu.fill_(bs * self.num_tokens_per_bs)
 
         # Attention backend
         if bs != raw_bs:
@@ -330,7 +336,6 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.req_pool_indices = self.req_pool_indices[:bs]
         forward_batch.positions = self.positions[:num_tokens]
 
-        # Special handle for seq_len_cpu used when flashinfer mla is used
         if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(self.seq_len_fill_value)