sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -15,11 +15,12 @@

 import argparse
 import dataclasses
+import json
 import logging
 import os
 import random
 import tempfile
-from typing import List, Optional
+from typing import List, Literal, Optional

 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
@@ -127,14 +128,14 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
-    grammar_backend: Optional[str] = "xgrammar"
+    grammar_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
-    speculative_num_steps: int = 5
-    speculative_eagle_topk: int = 4
-    speculative_num_draft_tokens: int = 8
+    speculative_num_steps: Optional[int] = None
+    speculative_eagle_topk: Optional[int] = None
+    speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
@@ -160,6 +161,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
@@ -177,10 +179,12 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
-    enable_flashinfer_mla: bool = False
+    enable_flashinfer_mla: bool = False  # TODO: remove this argument
     enable_flashmla: bool = False
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
+    n_share_experts_fusion: int = 0
+    disable_shared_experts_fusion: bool = False

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -192,6 +196,13 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998

     def __post_init__(self):
+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -215,6 +226,9 @@ class ServerArgs:
             # GPU memory is not known yet or no GPU is available.
             gpu_mem = None

+        if is_hip():
+            self.disable_shared_experts_fusion = True
+
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
@@ -253,15 +267,11 @@ class ServerArgs:
             else:
                 self.cuda_graph_max_bs = 160

-        # Choose kernel backends
+        # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

-        if self.attention_backend is None:
-            self.attention_backend = (
-                "flashinfer" if is_flashinfer_available() else "triton"
-            )
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -273,6 +283,10 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True

+        # Choose grammar backend
+        if self.grammar_backend is None:
+            self.grammar_backend = "xgrammar"
+
         # Expert parallelism
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
@@ -295,6 +309,10 @@ class ServerArgs:
         self.enable_sp_layernorm = False
         # DeepEP MoE
         if self.enable_deepep_moe:
+            if self.deepep_mode == "auto":
+                assert (
+                    not self.enable_dp_attention
+                ), "DeepEP MoE `auto` mode is not supported with DP Attention."
             self.ep_size = self.tp_size
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -313,12 +331,29 @@ class ServerArgs:
             or self.speculative_algorithm == "EAGLE3"
         ):
             if self.max_running_requests is None:
-                self.max_running_requests = 32
+                self.max_running_requests = 48
             self.disable_overlap_schedule = True
             logger.info(
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
+
+            # Auto choose parameters
+            if self.speculative_num_steps is None:
+                assert (
+                    self.speculative_eagle_topk is None
+                    and self.speculative_num_draft_tokens is None
+                )
+                (
+                    self.speculative_num_steps,
+                    self.speculative_eagle_topk,
+                    self.speculative_num_draft_tokens,
+                ) = auto_choose_speculative_params(self)
+
+            if self.page_size > 1 and self.speculative_eagle_topk > 1:
+                self.speculative_eagle_topk = 1
+                logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
+
             # The token generated from the verify step is counted.
             # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
             # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -462,6 +497,7 @@ class ServerArgs:
                 "modelopt",
                 "w8a8_int8",
                 "w8a8_fp8",
+                "moe_wna16",
             ],
             help="The quantization method.",
         )
@@ -795,14 +831,14 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance"],
+            choices=["xgrammar", "outlines", "llguidance", "none"],
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
         parser.add_argument(
             "--enable-flashinfer-mla",
             action="store_true",
-            help="Enable FlashInfer MLA optimization",
+            help="Enable FlashInfer MLA optimization. This argument will be deprecated soon! Please use '--attention-backend flashinfer' instead for switching on flashfiner mla!",
         )
         parser.add_argument(
             "--enable-flashmla",
@@ -1060,6 +1096,25 @@ class ServerArgs:
             action="store_true",
             help="Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+
+        parser.add_argument(
+            "--n-share-experts-fusion",
+            type=int,
+            default=0,
+            help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1 "
+            "we use tp_size by default.",
+        )
+        parser.add_argument(
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion by setting n_share_experts_fusion to 0.",
+        )

         # Server warmups
         parser.add_argument(
@@ -1253,3 +1308,33 @@ class DeprecatedAction(argparse.Action):

     def __call__(self, parser, namespace, values, option_string=None):
         raise ValueError(self.help)
+
+
+def auto_choose_speculative_params(self: ServerArgs):
+    """
+    Automatically choose the parameters for speculative decoding.
+
+    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
+    """
+    if self.decrypted_config_file:
+        config_path = self.decrypted_config_file
+    else:
+        config_path = os.path.join(self.model_path, "config.json")
+    if not os.path.exists(config_path):
+        raise ValueError(f"{config_path} is not found.")
+
+    config = json.load(open(config_path))
+
+    arch = config.get("architectures", ["Unknown"])[0]
+
+    if arch in ["LlamaForCausalLM"]:
+        # The default value for llama
+        return (5, 4, 8)
+    elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
+        # The default value for deepseek
+        return (5, 4, 8)
+    elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
+        return (5, 4, 8)
+    else:
+        # The default value for all other models
+        return (5, 4, 8)
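The three speculative-decoding knobs now default to None and are filled in __post_init__ via the new module-level auto_choose_speculative_params helper, which currently returns (5, 4, 8) for every listed architecture. A minimal sketch of exercising the helper on its own; the SimpleNamespace stand-in and the local checkpoint directory are assumptions for illustration, and that directory's config.json is expected to list "LlamaForCausalLM":

from types import SimpleNamespace

from sglang.srt.server_args import auto_choose_speculative_params

fake_args = SimpleNamespace(
    decrypted_config_file=None,       # fall back to <model_path>/config.json
    model_path="/models/llama-3-8b",  # placeholder local checkpoint directory
)
steps, topk, num_draft_tokens = auto_choose_speculative_params(fake_args)
print(steps, topk, num_draft_tokens)  # 5 4 8 for LlamaForCausalLM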
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -214,10 +214,10 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.positions = self.positions[:num_tokens]

         # Special handle for seq_len_cpu used when flashinfer mla is used
-        if (forward_batch.decode_seq_lens_cpu is not None) and (bs != raw_bs):
+        if (forward_batch.seq_lens_cpu is not None) and (bs != raw_bs):
             self.seq_lens_cpu.fill_(1)
-            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.decode_seq_lens_cpu)
-            forward_batch.decode_seq_lens_cpu = self.seq_lens_cpu[:bs]
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+            forward_batch.seq_lens_cpu = self.seq_lens_cpu[:bs]

         self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
             forward_batch, bs
@@ -233,7 +233,7 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.positions = self.positions[:raw_num_token]
         forward_batch.seq_lens = self.seq_lens[:raw_bs]
         forward_batch.req_pool_indices = self.req_pool_indices[:raw_bs]
-        if forward_batch.decode_seq_lens_cpu is not None:
-            forward_batch.decode_seq_lens_cpu = self.seq_lens_cpu[:raw_bs]
+        if forward_batch.seq_lens_cpu is not None:
+            forward_batch.seq_lens_cpu = self.seq_lens_cpu[:raw_bs]

         return out
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List, Optional

@@ -10,11 +11,15 @@ import triton.language as tl

 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.managers.schedule_batch import (
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda_available, is_hip, next_power_of_2

 if is_cuda_available():
     from sgl_kernel import (
@@ -34,6 +39,9 @@ import logging
 logger = logging.getLogger(__name__)


+SIMULATE_ACC_LEN = os.environ.get("SIMULATE_ACC_LEN")
+
+
 @dataclass
 class EagleDraftInput:
     # The inputs for decode
@@ -93,7 +101,7 @@ class EagleDraftInput:
             torch.cumsum(self.accept_length, axis=0, dtype=torch.int),
             self.positions,
             new_verified_id,
-            triton.next_power_of_2(speculative_num_steps + 1),
+            next_power_of_2(speculative_num_steps + 1),
         )

         batch.seq_lens_sum = sum(seq_lens_cpu)
@@ -225,18 +233,34 @@ class EagleVerifyInput:
             CaptureHiddenMode.FULL,
         )

-    def prepare_for_verify(self, batch: ScheduleBatch):
+    def prepare_for_verify(self, batch: ScheduleBatch, page_size: int):
         batch.input_ids = self.draft_token
-        batch.out_cache_loc = batch.alloc_token_slots(batch.input_ids.numel())
+
+        if page_size == 1:
+            batch.out_cache_loc = batch.alloc_token_slots(len(batch.input_ids))
+            end_offset = batch.seq_lens + self.draft_token_num
+        else:
+            prefix_lens = batch.seq_lens
+            end_offset = prefix_lens + self.draft_token_num
+            last_loc = get_last_loc(
+                batch.req_to_token_pool.req_to_token,
+                batch.req_pool_indices,
+                prefix_lens,
+            )
+            batch.out_cache_loc = batch.alloc_paged_token_slots_extend(
+                prefix_lens, end_offset, last_loc, len(batch.input_ids)
+            )
+            self.last_loc = last_loc
+
         bs = batch.batch_size()
         assign_req_to_token_pool[(bs,)](
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
             batch.seq_lens,
-            batch.seq_lens + self.draft_token_num,
+            end_offset,
             batch.out_cache_loc,
             batch.req_to_token_pool.req_to_token.shape[1],
-            triton.next_power_of_2(bs),
+            next_power_of_2(bs),
         )

     def generate_attn_arg_prefill(
@@ -282,6 +306,7 @@ class EagleVerifyInput:
         batch: ScheduleBatch,
         logits_output: torch.Tensor,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        page_size: int,
     ) -> torch.Tensor:
         """
         Verify and find accepted tokens based on logits output and batch
@@ -305,6 +330,7 @@ class EagleVerifyInput:
         )
         accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda")

+        # Apply penalty
         if sampling_info.penalizer_orchestrator.is_required:
             # This is a relaxed version of penalties for speculative decoding.
             linear_penalty = torch.zeros(
@@ -317,6 +343,7 @@ class EagleVerifyInput:
                 torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0)
             )

+        # Sample tokens
         if batch.sampling_info.is_all_greedy:
             target_predict = torch.argmax(logits_output.next_token_logits, dim=-1)
             target_predict = target_predict.reshape(bs, self.draft_token_num)
@@ -378,13 +405,24 @@ class EagleVerifyInput:
                 deterministic=True,
             )

+        if SIMULATE_ACC_LEN:
+            # Do simulation
+            accept_index = _generate_simulated_accept_index(
+                accept_index=accept_index,
+                predict=predict,  # mutable
+                accept_length=accept_length,  # mutable
+                simulate_acc_len=SIMULATE_ACC_LEN,
+                bs=bs,
+                spec_steps=self.spec_steps,
+            )
+
         new_accept_index = []
         unfinished_index = []
         accept_index_cpu = accept_index.tolist()
         predict_cpu = predict.tolist()
         has_finished = False

-        # iterate every accepted token and check if req has finished after append the token
+        # Iterate every accepted token and check if req has finished after append the token
         # should be checked BEFORE free kv cache slots
         for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)):
             new_accept_index_ = []
@@ -407,13 +445,28 @@ class EagleVerifyInput:
                        unfinished_index.append(i)
                    req.spec_verify_ct += 1

+        if has_finished:
+            accept_length = (accept_index != -1).sum(dim=1) - 1
+
+        # Free the KV cache for unaccepted tokens
+        accept_index = accept_index[accept_index != -1]
+        verified_id = predict[accept_index]
+        evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
+        evict_mask[accept_index] = False
+
+        if page_size != 1:
+            align_evict_mask_to_page_size[len(batch.seq_lens),](
+                batch.seq_lens,
+                evict_mask,
+                page_size,
+                self.draft_token_num,
+                next_power_of_2(self.draft_token_num),
+            )
+
+        token_to_kv_pool_allocator.free(batch.out_cache_loc[evict_mask])
+
+        # Construct EagleVerifyOutput
         if not has_finished:
-            accept_index = accept_index[accept_index != -1]
-            verified_id = predict[accept_index]
-            evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
-            evict_mask[accept_index] = False
-            mem_need_free_idx = batch.out_cache_loc[evict_mask]
-            token_to_kv_pool_allocator.free(mem_need_free_idx)
             batch.out_cache_loc = batch.out_cache_loc[accept_index]
             assign_req_to_token_pool[(bs,)](
                 batch.req_pool_indices,
@@ -422,7 +475,7 @@ class EagleVerifyInput:
                 batch.seq_lens + accept_length + 1,
                 batch.out_cache_loc,
                 batch.req_to_token_pool.req_to_token.shape[1],
-                triton.next_power_of_2(bs),
+                next_power_of_2(bs),
             )
             batch.seq_lens.add_(accept_length + 1)
             accept_length_cpu = accept_length.tolist()
@@ -443,13 +496,6 @@ class EagleVerifyInput:
                 accepeted_indices=accept_index,
             )
         else:
-            accept_length = (accept_index != -1).sum(dim=1) - 1
-            accept_index = accept_index[accept_index != -1]
-            verified_id = predict[accept_index]
-            evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
-            evict_mask[accept_index] = False
-            mem_need_free_idx = batch.out_cache_loc[evict_mask]
-            token_to_kv_pool_allocator.free(mem_need_free_idx)
             assign_req_to_token_pool[(bs,)](
                 batch.req_pool_indices,
                 batch.req_to_token_pool.req_to_token,
@@ -457,7 +503,7 @@ class EagleVerifyInput:
                 batch.seq_lens + accept_length + 1,
                 batch.out_cache_loc[accept_index],
                 batch.req_to_token_pool.req_to_token.shape[1],
-                triton.next_power_of_2(bs),
+                next_power_of_2(bs),
             )
             batch.seq_lens.add_(accept_length + 1)
             accept_length_cpu = accept_length.tolist()
@@ -465,20 +511,21 @@ class EagleVerifyInput:
         draft_input = EagleDraftInput()
         if len(new_accept_index) > 0:
             new_accept_index = torch.tensor(new_accept_index, device="cuda")
+            unfinished_index_device = torch.tensor(unfinished_index, device="cuda")
             draft_input.hidden_states = batch.spec_info.hidden_states[
                 new_accept_index
             ]
             draft_input.verified_id = predict[new_accept_index]
-            draft_input.accept_length = accept_length[unfinished_index]
             draft_input.accept_length_cpu = [
                 accept_length_cpu[i] for i in unfinished_index
             ]
+            draft_input.accept_length = accept_length[unfinished_index_device]
             if has_finished:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens[
-                    unfinished_index
+                    unfinished_index_device
                 ]
                 draft_input.req_pool_indices_for_draft_extend = (
-                    batch.req_pool_indices[unfinished_index]
+                    batch.req_pool_indices[unfinished_index_device]
                 )
             else:
                 draft_input.seq_lens_for_draft_extend = batch.seq_lens
@@ -564,13 +611,24 @@ def assign_draft_cache_locs(
     pool_len: tl.constexpr,
     topk: tl.constexpr,
     speculative_num_steps: tl.constexpr,
+    page_size: tl.constexpr,
 ):
     BLOCK_SIZE: tl.constexpr = 32
     pid = tl.program_id(axis=0)
     kv_start = tl.load(seq_lens + pid)
-    kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+
+    if page_size == 1 or topk == 1:
+        kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+        out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
+    else:
+        prefix_len = tl.load(seq_lens + pid)
+        last_page_len = prefix_len % page_size
+        num_new_page = (
+            last_page_len + speculative_num_steps + page_size - 1
+        ) // page_size
+        kv_end = prefix_len // page_size * page_size + num_new_page * (page_size * topk)
+
     token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
-    out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps

     num_loop = tl.cdiv(topk * speculative_num_steps, BLOCK_SIZE)
     for i in range(num_loop):
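For page_size > 1 with topk > 1, the kernel above now rounds the draft allocation up to whole pages per top-k branch. A plain-Python restatement of that arithmetic (illustrative only, not part of the diff):

def paged_kv_end(prefix_len: int, speculative_num_steps: int, topk: int, page_size: int) -> int:
    last_page_len = prefix_len % page_size
    # pages needed to cover the partially filled last page plus the draft steps
    num_new_page = (last_page_len + speculative_num_steps + page_size - 1) // page_size
    # allocation ends at the page-aligned prefix plus num_new_page pages per top-k branch
    return prefix_len // page_size * page_size + num_new_page * (page_size * topk)

# Example: a 13-token prefix with 4 draft steps, topk=2, and 8-token pages
assert paged_kv_end(13, 4, 2, 8) == 8 + 2 * (8 * 2)  # == 40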
@@ -642,6 +700,29 @@ def generate_draft_decode_kv_indices(
     tl.store(kv_indptr + zid, base + zid * iters)


+@triton.jit
+def align_evict_mask_to_page_size(
+    seq_lens,
+    evict_mask,
+    page_size: tl.constexpr,
+    num_draft_tokens: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    t_range = tl.arange(0, BLOCK_SIZE)
+
+    bid = tl.program_id(axis=0)
+    seq_len = tl.load(seq_lens + bid)
+    io_mask = t_range < num_draft_tokens
+    mask_row = tl.load(evict_mask + bid * num_draft_tokens + t_range, mask=io_mask)
+
+    num_trues = tl.sum(mask_row)
+    num_false = num_draft_tokens - num_trues
+
+    start = (seq_len + num_false - 1) // page_size * page_size - seq_len
+    for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
+        tl.store(evict_mask + bid * num_draft_tokens + i, False)
+
+
 @torch.compile(dynamic=True)
 def select_top_k_tokens(
     i: int,
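The new align_evict_mask_to_page_size kernel keeps eviction page-aligned: after counting the kept (non-evicted) draft tokens of a request, it clears the mask for the page that still holds kept tokens so only whole trailing pages are freed. A single-row reference in plain Python, written for illustration under the same semantics:

def align_row(seq_len: int, evict_row: list, page_size: int) -> list:
    num_draft_tokens = len(evict_row)
    num_false = num_draft_tokens - sum(evict_row)  # draft tokens that are kept
    # offset (relative to seq_len) of the page containing the last kept token
    start = (seq_len + num_false - 1) // page_size * page_size - seq_len
    for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
        evict_row[i] = False  # do not free slots on the partially used page
    return evict_row

# seq_len=13, 2 of 6 draft tokens kept, 4-token pages: offsets 0-2 share a page with kept tokens
print(align_row(13, [False, False, True, True, True, True], 4))
# [False, False, False, True, True, True]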
@@ -699,3 +780,34 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
+
+
+def _generate_simulated_accept_index(
+    accept_index,
+    predict,
+    accept_length,
+    simulate_acc_len,
+    bs,
+    spec_steps,
+):
+    simulate_acc_len_float = float(simulate_acc_len)
+    simulated_values = torch.normal(
+        mean=simulate_acc_len_float,
+        std=1.0,
+        size=(1,),
+        device="cpu",
+    )
+    # clamp simulated values to be between 1 and self.spec_steps
+    simulated_values = torch.clamp(simulated_values, min=1.0, max=spec_steps)
+    simulate_acc_len = int(simulated_values.round().item())
+
+    accept_indx_first_col = accept_index[:, 0].view(-1, 1)
+    sim_accept_index = torch.full(
+        (bs, spec_steps + 1), -1, dtype=torch.int32, device="cuda"
+    )
+    sim_accept_index[:, :simulate_acc_len] = accept_indx_first_col + torch.arange(
+        simulate_acc_len, device=accept_index.device
+    )
+    accept_length.fill_(simulate_acc_len - 1)
+    predict.fill_(100)  # some legit token id
+    return sim_accept_index
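SIMULATE_ACC_LEN is read from the environment once at import time and, when set, routes verification through _generate_simulated_accept_index to fake a target accept length, which appears intended for experiments that control acceptance behavior. A hedged usage note, assuming an environment where sglang's CUDA/Triton dependencies are importable:

import os

# Must be set before sglang.srt.speculative.eagle_utils is imported,
# e.g. before launching the server process.
os.environ["SIMULATE_ACC_LEN"] = "3"  # simulated mean accepted length per verify step

import sglang.srt.speculative.eagle_utils as eagle_utils  # noqa: E402

print(eagle_utils.SIMULATE_ACC_LEN)  # "3"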