PyPI - sglang - Versions diffs - 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl - Mend

sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)

sglang/bench_serving.py +72 -10
sglang/srt/_custom_ops.py +59 -92
sglang/srt/configs/deepseekvl2.py +10 -1
sglang/srt/configs/model_config.py +6 -16
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/custom_op.py +5 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
sglang/srt/distributed/parallel_state.py +32 -5
sglang/srt/entrypoints/engine.py +0 -5
sglang/srt/entrypoints/http_server.py +7 -1
sglang/srt/entrypoints/verl_engine.py +2 -0
sglang/srt/function_call_parser.py +0 -1
sglang/srt/layers/attention/flashattention_backend.py +582 -125
sglang/srt/layers/attention/flashinfer_backend.py +5 -7
sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
sglang/srt/layers/attention/flashmla_backend.py +1 -1
sglang/srt/layers/dp_attention.py +12 -1
sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
sglang/srt/layers/moe/ep_moe/layer.py +79 -80
sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
sglang/srt/layers/moe/topk.py +79 -6
sglang/srt/layers/quantization/__init__.py +137 -165
sglang/srt/layers/quantization/awq.py +200 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
sglang/srt/layers/quantization/fp8_kernel.py +2 -1
sglang/srt/layers/quantization/fp8_utils.py +1 -4
sglang/srt/layers/quantization/gptq.py +30 -40
sglang/srt/layers/quantization/moe_wna16.py +501 -0
sglang/srt/layers/quantization/utils.py +1 -1
sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
sglang/srt/lora/backend/base_backend.py +4 -4
sglang/srt/lora/backend/flashinfer_backend.py +12 -9
sglang/srt/lora/backend/triton_backend.py +5 -8
sglang/srt/lora/layers.py +19 -33
sglang/srt/lora/lora_manager.py +20 -7
sglang/srt/lora/mem_pool.py +12 -6
sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
sglang/srt/lora/utils.py +6 -0
sglang/srt/managers/cache_controller.py +34 -11
sglang/srt/managers/io_struct.py +4 -2
sglang/srt/managers/mm_utils.py +202 -156
sglang/srt/managers/multimodal_processor.py +0 -2
sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
sglang/srt/managers/multimodal_processors/clip.py +44 -0
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
sglang/srt/managers/multimodal_processors/llava.py +34 -14
sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
sglang/srt/managers/multimodal_processors/mlama.py +10 -23
sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
sglang/srt/managers/schedule_batch.py +185 -127
sglang/srt/managers/scheduler.py +29 -23
sglang/srt/managers/tokenizer_manager.py +1 -2
sglang/srt/managers/tp_worker.py +3 -0
sglang/srt/managers/utils.py +1 -6
sglang/srt/mem_cache/hiradix_cache.py +62 -52
sglang/srt/mem_cache/memory_pool.py +72 -6
sglang/srt/mem_cache/paged_allocator.py +39 -0
sglang/srt/metrics/collector.py +23 -53
sglang/srt/model_executor/cuda_graph_runner.py +16 -13
sglang/srt/model_executor/forward_batch_info.py +10 -10
sglang/srt/model_executor/model_runner.py +64 -59
sglang/srt/model_loader/loader.py +19 -1
sglang/srt/model_loader/weight_utils.py +6 -3
sglang/srt/models/clip.py +568 -0
sglang/srt/models/deepseek_janus_pro.py +12 -17
sglang/srt/models/deepseek_v2.py +339 -123
sglang/srt/models/deepseek_vl2.py +105 -104
sglang/srt/models/gemma3_causal.py +12 -2
sglang/srt/models/gemma3_mm.py +20 -80
sglang/srt/models/llama.py +4 -1
sglang/srt/models/llava.py +31 -19
sglang/srt/models/llavavid.py +16 -7
sglang/srt/models/minicpmo.py +63 -147
sglang/srt/models/minicpmv.py +17 -27
sglang/srt/models/mllama.py +29 -14
sglang/srt/models/qwen2.py +9 -6
sglang/srt/models/qwen2_5_vl.py +21 -31
sglang/srt/models/qwen2_vl.py +20 -21
sglang/srt/openai_api/adapter.py +106 -93
sglang/srt/openai_api/protocol.py +10 -5
sglang/srt/patch_torch.py +71 -0
sglang/srt/platforms/interface.py +371 -0
sglang/srt/server_args.py +120 -25
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
sglang/srt/speculative/eagle_utils.py +140 -28
sglang/srt/speculative/eagle_worker.py +94 -25
sglang/srt/utils.py +137 -51
sglang/test/runners.py +27 -2
sglang/test/test_custom_ops.py +55 -0
sglang/test/test_utils.py +14 -27
sglang/utils.py +2 -2
sglang/version.py +1 -1
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0

sglang/srt/speculative/eagle_utils.py CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List, Optional
@@ -10,11 +11,15 @@ import triton.language as tl
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.managers.schedule_batch import (
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import is_cuda_available, is_hip, next_power_of_2
 if is_cuda_available():
     from sgl_kernel import (
@@ -34,6 +39,9 @@ import logging
 logger = logging.getLogger(__name__)
+SIMULATE_ACC_LEN = os.environ.get("SIMULATE_ACC_LEN")
 @dataclass
 class EagleDraftInput:
     # The inputs for decode
@@ -93,7 +101,7 @@ class EagleDraftInput:
             torch.cumsum(self.accept_length, axis=0, dtype=torch.int),
             self.positions,
             new_verified_id,
-            triton.next_power_of_2(speculative_num_steps + 1),
+            next_power_of_2(speculative_num_steps + 1),
         )
         batch.seq_lens_sum = sum(seq_lens_cpu)
@@ -225,18 +233,34 @@ class EagleVerifyInput:
             CaptureHiddenMode.FULL,
         )
-    def prepare_for_verify(self, batch: ScheduleBatch):
+    def prepare_for_verify(self, batch: ScheduleBatch, page_size: int):
         batch.input_ids = self.draft_token
-        batch.out_cache_loc = batch.alloc_token_slots(batch.input_ids.numel())
+        if page_size == 1:
+            batch.out_cache_loc = batch.alloc_token_slots(len(batch.input_ids))
+            end_offset = batch.seq_lens + self.draft_token_num
+        else:
+            prefix_lens = batch.seq_lens
+            end_offset = prefix_lens + self.draft_token_num
+            last_loc = get_last_loc(
+                batch.req_to_token_pool.req_to_token,
+                batch.req_pool_indices,
+                prefix_lens,
+            )
+            batch.out_cache_loc = batch.alloc_paged_token_slots_extend(
+                prefix_lens, end_offset, last_loc, len(batch.input_ids)
+            )
+            self.last_loc = last_loc
         bs = batch.batch_size()
         assign_req_to_token_pool[(bs,)](
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
             batch.seq_lens,
-            batch.seq_lens + self.draft_token_num,
+            end_offset,
             batch.out_cache_loc,
             batch.req_to_token_pool.req_to_token.shape[1],
-            triton.next_power_of_2(bs),
+            next_power_of_2(bs),
         )
     def generate_attn_arg_prefill(
@@ -282,6 +306,7 @@ class EagleVerifyInput:
         batch: ScheduleBatch,
         logits_output: torch.Tensor,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        page_size: int,
     ) -> torch.Tensor:
         """
         Verify and find accepted tokens based on logits output and batch
@@ -305,6 +330,7 @@ class EagleVerifyInput:
         )
         accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda")
+        # Apply penalty
         if sampling_info.penalizer_orchestrator.is_required:
             # This is a relaxed version of penalties for speculative decoding.
             linear_penalty = torch.zeros(
@@ -317,6 +343,7 @@ class EagleVerifyInput:
                 torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0)
             )
+        # Sample tokens
         if batch.sampling_info.is_all_greedy:
             target_predict = torch.argmax(logits_output.next_token_logits, dim=-1)
             target_predict = target_predict.reshape(bs, self.draft_token_num)
@@ -378,13 +405,24 @@ class EagleVerifyInput:
                 deterministic=True,
             )
+        if SIMULATE_ACC_LEN:
+            # Do simulation
+            accept_index = _generate_simulated_accept_index(
+                accept_index=accept_index,
+                predict=predict,  # mutable
+                accept_length=accept_length,  # mutable
+                simulate_acc_len=SIMULATE_ACC_LEN,
+                bs=bs,
+                spec_steps=self.spec_steps,
+            )
         new_accept_index = []
         unfinished_index = []
         accept_index_cpu = accept_index.tolist()
         predict_cpu = predict.tolist()
         has_finished = False
-        # iterate every accepted token and check if req has finished after append the token
+        # Iterate every accepted token and check if req has finished after append the token
         # should be checked BEFORE free kv cache slots
         for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)):
             new_accept_index_ = []
@@ -407,13 +445,28 @@ class EagleVerifyInput:
                 unfinished_index.append(i)
             req.spec_verify_ct += 1
+        if has_finished:
+            accept_length = (accept_index != -1).sum(dim=1) - 1
+        # Free the KV cache for unaccepted tokens
+        accept_index = accept_index[accept_index != -1]
+        verified_id = predict[accept_index]
+        evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
+        evict_mask[accept_index] = False
+        if page_size != 1:
+            align_evict_mask_to_page_size[len(batch.seq_lens),](
+                batch.seq_lens,
+                evict_mask,
+                page_size,
+                self.draft_token_num,
+                next_power_of_2(self.draft_token_num),
+            )
+        token_to_kv_pool_allocator.free(batch.out_cache_loc[evict_mask])
+        # Construct EagleVerifyOutput
         if not has_finished:
-            accept_index = accept_index[accept_index != -1]
-            verified_id = predict[accept_index]
-            evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
-            evict_mask[accept_index] = False
-            mem_need_free_idx = batch.out_cache_loc[evict_mask]
-            token_to_kv_pool_allocator.free(mem_need_free_idx)
             batch.out_cache_loc = batch.out_cache_loc[accept_index]
             assign_req_to_token_pool[(bs,)](
                 batch.req_pool_indices,
@@ -422,7 +475,7 @@ class EagleVerifyInput:
                 batch.seq_lens + accept_length + 1,
                 batch.out_cache_loc,
                 batch.req_to_token_pool.req_to_token.shape[1],
-                triton.next_power_of_2(bs),
+                next_power_of_2(bs),
             )
             batch.seq_lens.add_(accept_length + 1)
             accept_length_cpu = accept_length.tolist()
@@ -443,13 +496,6 @@ class EagleVerifyInput:
                 accepeted_indices=accept_index,
             )
         else:
-            accept_length = (accept_index != -1).sum(dim=1) - 1
-            accept_index = accept_index[accept_index != -1]
-            verified_id = predict[accept_index]
-            evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
-            evict_mask[accept_index] = False
-            mem_need_free_idx = batch.out_cache_loc[evict_mask]
-            token_to_kv_pool_allocator.free(mem_need_free_idx)
             assign_req_to_token_pool[(bs,)](
                 batch.req_pool_indices,
                 batch.req_to_token_pool.req_to_token,
@@ -457,7 +503,7 @@ class EagleVerifyInput:
                 batch.seq_lens + accept_length + 1,
                 batch.out_cache_loc[accept_index],
                 batch.req_to_token_pool.req_to_token.shape[1],
-                triton.next_power_of_2(bs),
+                next_power_of_2(bs),
             )
             batch.seq_lens.add_(accept_length + 1)
             accept_length_cpu = accept_length.tolist()
@@ -465,20 +511,21 @@ class EagleVerifyInput:
             draft_input = EagleDraftInput()
             if len(new_accept_index) > 0:
                 new_accept_index = torch.tensor(new_accept_index, device="cuda")
+                unfinished_index_device = torch.tensor(unfinished_index, device="cuda")
                 draft_input.hidden_states = batch.spec_info.hidden_states[
                     new_accept_index
                 ]
                 draft_input.verified_id = predict[new_accept_index]
-                draft_input.accept_length = accept_length[unfinished_index]
                 draft_input.accept_length_cpu = [
                     accept_length_cpu[i] for i in unfinished_index
                 ]
+                draft_input.accept_length = accept_length[unfinished_index_device]
                 if has_finished:
                     draft_input.seq_lens_for_draft_extend = batch.seq_lens[
-                        unfinished_index
+                        unfinished_index_device
                     ]
                     draft_input.req_pool_indices_for_draft_extend = (
-                        batch.req_pool_indices[unfinished_index]
+                        batch.req_pool_indices[unfinished_index_device]
                     )
                 else:
                     draft_input.seq_lens_for_draft_extend = batch.seq_lens
@@ -564,13 +611,24 @@ def assign_draft_cache_locs(
     pool_len: tl.constexpr,
     topk: tl.constexpr,
     speculative_num_steps: tl.constexpr,
+    page_size: tl.constexpr,
 ):
     BLOCK_SIZE: tl.constexpr = 32
     pid = tl.program_id(axis=0)
     kv_start = tl.load(seq_lens + pid)
-    kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+    if page_size == 1 or topk == 1:
+        kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+        out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
+    else:
+        prefix_len = tl.load(seq_lens + pid)
+        last_page_len = prefix_len % page_size
+        num_new_page = (
+            last_page_len + speculative_num_steps + page_size - 1
+        ) // page_size
+        kv_end = prefix_len // page_size * page_size + num_new_page * (page_size * topk)
     token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
-    out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
     num_loop = tl.cdiv(topk * speculative_num_steps, BLOCK_SIZE)
     for i in range(num_loop):
@@ -642,6 +700,29 @@ def generate_draft_decode_kv_indices(
     tl.store(kv_indptr + zid, base + zid * iters)
+@triton.jit
+def align_evict_mask_to_page_size(
+    seq_lens,
+    evict_mask,
+    page_size: tl.constexpr,
+    num_draft_tokens: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    t_range = tl.arange(0, BLOCK_SIZE)
+    bid = tl.program_id(axis=0)
+    seq_len = tl.load(seq_lens + bid)
+    io_mask = t_range < num_draft_tokens
+    mask_row = tl.load(evict_mask + bid * num_draft_tokens + t_range, mask=io_mask)
+    num_trues = tl.sum(mask_row)
+    num_false = num_draft_tokens - num_trues
+    start = (seq_len + num_false - 1) // page_size * page_size - seq_len
+    for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
+        tl.store(evict_mask + bid * num_draft_tokens + i, False)
 @torch.compile(dynamic=True)
 def select_top_k_tokens(
     i: int,
@@ -699,3 +780,34 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
+def _generate_simulated_accept_index(
+    accept_index,
+    predict,
+    accept_length,
+    simulate_acc_len,
+    bs,
+    spec_steps,
+):
+    simulate_acc_len_float = float(simulate_acc_len)
+    simulated_values = torch.normal(
+        mean=simulate_acc_len_float,
+        std=1.0,
+        size=(1,),
+        device="cpu",
+    )
+    # clamp simulated values to be between 1 and self.spec_steps
+    simulated_values = torch.clamp(simulated_values, min=1.0, max=spec_steps)
+    simulate_acc_len = int(simulated_values.round().item())
+    accept_indx_first_col = accept_index[:, 0].view(-1, 1)
+    sim_accept_index = torch.full(
+        (bs, spec_steps + 1), -1, dtype=torch.int32, device="cuda"
+    )
+    sim_accept_index[:, :simulate_acc_len] = accept_indx_first_col + torch.arange(
+        simulate_acc_len, device=accept_index.device
+    )
+    accept_length.fill_(simulate_acc_len - 1)
+    predict.fill_(100)  # some legit token id
+    return sim_accept_index

sglang/srt/speculative/eagle_worker.py CHANGED

@@ -11,7 +11,11 @@ from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group
 from sglang.srt.layers.dp_attention import disable_dp_size
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs
-from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.managers.schedule_batch import (
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
@@ -67,6 +71,7 @@ class EAGLEWorker(TpModelWorker):
         self.gpu_id = gpu_id
         self.device = server_args.device
         self.target_worker = target_worker
+        self.page_size = server_args.page_size
         self.speculative_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
@@ -145,15 +150,26 @@ class EAGLEWorker(TpModelWorker):
     def init_attention_backend(self):
         # Create multi-step attn backends and cuda graph runners
         if self.server_args.attention_backend == "flashinfer":
-            from sglang.srt.layers.attention.flashinfer_backend import (
-                FlashInferMultiStepDraftBackend,
-            )
+            if not global_server_args_dict["use_mla_backend"]:
+                from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferMultiStepDraftBackend,
+                )
-            self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
+                self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                    self.draft_model_runner,
+                    self.topk,
+                    self.speculative_num_steps,
+                )
+            else:
+                from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAMultiStepDraftBackend,
+                )
+                self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
+                    self.draft_model_runner,
+                    self.topk,
+                    self.speculative_num_steps,
+                )
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = True
@@ -170,19 +186,19 @@ class EAGLEWorker(TpModelWorker):
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = False
-        elif self.server_args.attention_backend == "flashinfer_mla":
-            from sglang.srt.layers.attention.flashinfer_mla_backend import (
-                FlashInferMLAMultiStepDraftBackend,
+        elif self.server_args.attention_backend == "fa3":
+            from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionMultiStepBackend,
             )
-            self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
+            self.draft_attn_backend = FlashAttentionMultiStepBackend(
                 self.draft_model_runner,
                 self.topk,
                 self.speculative_num_steps,
             )
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
-            self.has_prefill_wrapper_verify = True
+            self.has_prefill_wrapper_verify = False
         else:
             raise ValueError(
                 f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
@@ -234,14 +250,11 @@ class EAGLEWorker(TpModelWorker):
         """
         if batch.forward_mode.is_decode():
             with self.draft_tp_context(self.draft_model_runner.tp_group):
-                spec_info, to_free_cache_loc = self.draft(batch)
+                spec_info = self.draft(batch)
             logits_output, verify_output, model_worker_batch = self.verify(
                 batch, spec_info
             )
-            # Free cache loc (we put it here to avoid synchronization and hide kernel launch overhead.)
-            self.token_to_kv_pool_allocator.free(to_free_cache_loc)
             # If it is None, it means all requests are finished
             if batch.spec_info.verified_id is not None:
                 with self.draft_tp_context(self.draft_model_runner.tp_group):
@@ -305,9 +318,59 @@ class EAGLEWorker(TpModelWorker):
             )
         # Allocate cache locations
-        out_cache_loc = batch.alloc_token_slots(
-            num_seqs * self.topk * self.speculative_num_steps
-        )
+        if self.page_size == 1:
+            out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
+                num_seqs * self.topk * self.speculative_num_steps, backup_state=True
+            )
+        else:
+            if self.topk == 1:
+                prefix_lens = batch.seq_lens
+                seq_lens = prefix_lens + self.speculative_num_steps
+                extend_num_tokens = num_seqs * self.speculative_num_steps
+            else:
+                # In this case, the last partial page needs to be duplicated.
+                # KV cache layout in batch.req_to_token_pool.req_to_token:
+                #
+                # | -------- | -- xxxx .. | -- xxxx .. | -- xxxx .. |
+                #    prefix     top-k = 0    tok-k = 1    top-k = 2
+                #
+                #  "-" means prefix tokens
+                #  "x" means speculative draft tokens
+                #  "." means padded tokens
+                # TODO: fuse these ops
+                prefix_lens = batch.seq_lens
+                last_page_lens = prefix_lens % self.page_size
+                num_new_pages = (
+                    last_page_lens + self.speculative_num_steps + self.page_size - 1
+                ) // self.page_size
+                seq_lens = (
+                    prefix_lens // self.page_size * self.page_size
+                    + num_new_pages * (self.page_size * self.topk)
+                )
+                extend_num_tokens = torch.sum(seq_lens - prefix_lens).item()
+                raise NotImplementedError(
+                    "page_size > 1 and top_k > 1 are not supported."
+                )
+                # TODO: Support page_size > 1 and top_k > 1
+                # 1. Duplicate the KV cache in the last partial page for all top-k segments
+                # 2. Modify generate_draft_decode_kv_indices accordingly
+            last_loc = get_last_loc(
+                batch.req_to_token_pool.req_to_token,
+                batch.req_pool_indices,
+                prefix_lens,
+            )
+            out_cache_loc, token_to_kv_pool_state_backup = (
+                batch.alloc_paged_token_slots_extend(
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    extend_num_tokens,
+                    backup_state=True,
+                )
+            )
         assign_draft_cache_locs[(num_seqs,)](
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
@@ -316,6 +379,7 @@ class EAGLEWorker(TpModelWorker):
             batch.req_to_token_pool.req_to_token.shape[1],
             self.topk,
             self.speculative_num_steps,
+            self.page_size,
         )
         batch.out_cache_loc = out_cache_loc
         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
@@ -343,6 +407,8 @@ class EAGLEWorker(TpModelWorker):
             # Run forward steps
             score_list, token_list, parents_list = self.draft_forward(forward_batch)
+        self.token_to_kv_pool_allocator.restore_state(token_to_kv_pool_state_backup)
         ret = EagleVerifyInput.create(
             spec_info.verified_id,
             score_list,
@@ -354,7 +420,7 @@ class EAGLEWorker(TpModelWorker):
             self.speculative_num_steps,
             self.server_args.speculative_num_draft_tokens,
         )
-        return ret, out_cache_loc
+        return ret
     def draft_forward(self, forward_batch: ForwardBatch):
         # Parse args
@@ -411,7 +477,7 @@ class EAGLEWorker(TpModelWorker):
         return score_list, token_list, parents_list
     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
-        spec_info.prepare_for_verify(batch)
+        spec_info.prepare_for_verify(batch, self.page_size)
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch()
@@ -421,7 +487,10 @@ class EAGLEWorker(TpModelWorker):
         self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
         res: EagleVerifyOutput = spec_info.verify(
-            batch, logits_output, self.token_to_kv_pool_allocator
+            batch,
+            logits_output,
+            self.token_to_kv_pool_allocator,
+            self.page_size,
         )
         # Post process based on verified outputs.
@@ -586,5 +655,5 @@ def load_token_map(token_map_path: str) -> List[int]:
             ignore_patterns=["*.bin", "*.safetensors"],
         )
         token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
-    hot_token_id = torch.load(token_map_path)
+    hot_token_id = torch.load(token_map_path, weights_only=True)
     return torch.tensor(hot_token_id, dtype=torch.int32)

sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl