sglang 0.4.2.post1__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/constrained/outlines_backend.py +9 -1
- sglang/srt/custom_op.py +40 -0
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/layers/activation.py +10 -5
- sglang/srt/layers/attention/flashinfer_backend.py +284 -39
- sglang/srt/layers/attention/triton_backend.py +71 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
- sglang/srt/layers/layernorm.py +1 -5
- sglang/srt/layers/moe/ep_moe/layer.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
- sglang/srt/layers/moe/topk.py +4 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8_kernel.py +140 -2
- sglang/srt/layers/rotary_embedding.py +1 -3
- sglang/srt/layers/sampler.py +4 -4
- sglang/srt/lora/backend/__init__.py +8 -0
- sglang/srt/lora/backend/base_backend.py +95 -0
- sglang/srt/lora/backend/flashinfer_backend.py +91 -0
- sglang/srt/lora/backend/triton_backend.py +61 -0
- sglang/srt/lora/lora.py +127 -112
- sglang/srt/lora/lora_manager.py +50 -18
- sglang/srt/lora/triton_ops/__init__.py +5 -0
- sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
- sglang/srt/model_executor/cuda_graph_runner.py +77 -80
- sglang/srt/model_executor/forward_batch_info.py +58 -59
- sglang/srt/model_executor/model_runner.py +2 -2
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/server_args.py +13 -2
- sglang/srt/speculative/build_eagle_tree.py +4 -2
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
- sglang/srt/speculative/eagle_utils.py +361 -372
- sglang/srt/speculative/eagle_worker.py +177 -45
- sglang/srt/utils.py +7 -0
- sglang/test/runners.py +2 -0
- sglang/version.py +1 -1
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +15 -6
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +72 -33
- sglang/srt/layers/custom_op_util.py +0 -25
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.2.post1.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -21,8 +21,8 @@ from typing import TYPE_CHECKING, Callable

 import torch
 import tqdm
-from vllm.model_executor.custom_op import CustomOp

+from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import get_tensor_model_parallel_rank
 from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -103,69 +103,75 @@ def set_torch_compile_config():
     torch._dynamo.config.cache_size_limit = 1024


+def get_batch_sizes_to_capture(model_runner: ModelRunner):
+    server_args = model_runner.server_args
+    capture_bs = server_args.cuda_graph_bs
+    if capture_bs is None:
+        if server_args.disable_cuda_graph_padding:
+            capture_bs = list(range(1, 33)) + [64, 128]
+        else:
+            capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+    if max(capture_bs) > model_runner.req_to_token_pool.size:
+        # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+        # is very samll. We add more values here to make sure we capture the maximum bs.
+        capture_bs = list(
+            sorted(
+                set(
+                    capture_bs
+                    + [model_runner.req_to_token_pool.size - 1]
+                    + [model_runner.req_to_token_pool.size]
+                )
+            )
+        )
+    capture_bs = [
+        bs
+        for bs in capture_bs
+        if bs <= model_runner.req_to_token_pool.size
+        and bs <= server_args.cuda_graph_max_bs
+    ]
+    compile_bs = (
+        [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
+        if server_args.enable_torch_compile
+        else []
+    )
+    return capture_bs, compile_bs
+
+
+# Reuse this memory pool across all cuda graph runners.
+global_graph_memory_pool = None
+
+
+def get_global_graph_memory_pool():
+    return global_graph_memory_pool
+
+
+def set_global_graph_memory_pool(val):
+    global global_graph_memory_pool
+    global_graph_memory_pool = val
+
+
 class CudaGraphRunner:
     """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""

-    def __init__(self, model_runner:
+    def __init__(self, model_runner: ModelRunner):
         # Parse args
         self.model_runner = model_runner
         self.graphs = {}
-        self.input_buffers = {}
         self.output_buffers = {}
-        self.
-        self.graph_memory_pool = None
-        self.use_torch_compile = model_runner.server_args.enable_torch_compile
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
         self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
-        self.is_encoder_decoder =
-        self.enable_dp_attention =
-        self.tp_size =
-        self.dp_size =
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.enable_dp_attention = model_runner.server_args.enable_dp_attention
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size

         # Batch sizes to capture
-        self.capture_bs =
-        if self.capture_bs is None:
-            if model_runner.server_args.disable_cuda_graph_padding:
-                self.capture_bs = list(range(1, 33)) + [64, 128]
-            else:
-                self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
-
-        if max(self.capture_bs) > model_runner.req_to_token_pool.size:
-            # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
-            # is very samll. We add more values here to make sure we capture the maximum bs.
-            self.capture_bs = list(
-                sorted(
-                    set(
-                        self.capture_bs
-                        + [model_runner.req_to_token_pool.size - 1]
-                        + [model_runner.req_to_token_pool.size]
-                    )
-                )
-            )
-
-        self.capture_bs = [
-            bs
-            for bs in self.capture_bs
-            if bs <= model_runner.req_to_token_pool.size
-            and bs <= model_runner.server_args.cuda_graph_max_bs
-        ]
-
-        self.compile_bs = (
-            [
-                bs
-                for bs in self.capture_bs
-                if bs <= self.model_runner.server_args.torch_compile_max_bs
-            ]
-            if self.use_torch_compile
-            else []
-        )
-
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
         self.capture_forward_mode = ForwardMode.DECODE
         self.num_tokens_per_bs = 1
         if model_runner.spec_algorithm.is_eagle():
             if self.model_runner.is_draft_worker:
-
-                    self.model_runner.server_args.speculative_eagle_topk
-                )
+                raise RuntimeError("This should not happen")
             else:
                 self.capture_forward_mode = ForwardMode.TARGET_VERIFY
                 self.num_tokens_per_bs = (
@@ -182,10 +188,10 @@ class CudaGraphRunner:
         # FIXME(lsyin): leave it here for now, I don't know whether it is necessary
         self.encoder_len_fill_value = 0

-        if self.
+        if self.enable_torch_compile:
             set_torch_compile_config()

-        #
+        # Graph inputs
         with torch.device("cuda"):
             self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
             self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
@@ -301,7 +307,7 @@
         stream = self.stream
         num_tokens = bs * self.num_tokens_per_bs

-        #
+        # Graph inputs
         input_ids = self.input_ids[:num_tokens]
         req_pool_indices = self.req_pool_indices[:bs]
         seq_lens = self.seq_lens[:bs]
@@ -320,7 +326,7 @@
             global_num_tokens = None
             gathered_buffer = None

-        spec_info = self.get_spec_info(num_tokens
+        spec_info = self.get_spec_info(num_tokens)

         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,
@@ -335,7 +341,6 @@
             seq_lens_sum=seq_lens.sum(),
             encoder_lens=encoder_lens,
             return_logprob=False,
-            top_logprobs_nums=[0] * bs,
             positions=positions,
             global_num_tokens=global_num_tokens,
             gathered_buffer=gathered_buffer,
@@ -375,13 +380,14 @@
         torch.cuda.synchronize()
         self.model_runner.tp_group.barrier()

-
+        global global_graph_memory_pool
+        with torch.cuda.graph(graph, pool=global_graph_memory_pool, stream=stream):
             out = run_once()

         torch.cuda.synchronize()
         self.model_runner.tp_group.barrier()

-
+        global_graph_memory_pool = graph.pool()
         return graph, out

     def replay(self, forward_batch: ForwardBatch):
@@ -439,35 +445,26 @@
         )
         return logits_output

-    def get_spec_info(self, num_tokens: int
+    def get_spec_info(self, num_tokens: int):
         spec_info = None
         if self.model_runner.spec_algorithm.is_eagle():
-            from sglang.srt.speculative.eagle_utils import
-                EAGLEDraftInput,
-                EagleVerifyInput,
-            )
+            from sglang.srt.speculative.eagle_utils import EagleVerifyInput

             if self.model_runner.is_draft_worker:
-
-                spec_info.load_server_args(self.model_runner.server_args)
-                spec_info.hidden_states = self.hidden_states[:num_tokens]
-                spec_info.positions = positions
-                spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+                raise RuntimeError("This should not happen.")
             else:
                 spec_info = EagleVerifyInput(
-                    None,
-
-
-
-
-
-
-
-
-
-
-                    device="cuda",
+                    draft_token=None,
+                    custom_mask=torch.zeros(
+                        (num_tokens * self.model_runner.model_config.context_len),
+                        dtype=torch.bool,
+                        device="cuda",
+                    ),
+                    positions=None,
+                    retrive_index=None,
+                    retrive_cum_len=None,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
                 )
-                spec_info.capture_hidden_mode = CaptureHiddenMode.FULL

         return spec_info
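Note: the capture-list logic above is unchanged in substance; it only moves out of CudaGraphRunner.__init__ into the module-level helper get_batch_sizes_to_capture so that the new EAGLE draft runner can reuse it. A minimal standalone sketch of the default selection (the pool size and limit below are made-up placeholder values, not sglang defaults):

    # Sketch of the default CUDA-graph batch-size selection shown in the hunk above.
    # Real values come from ServerArgs and the request-to-token pool.
    def default_capture_bs(disable_cuda_graph_padding: bool, cuda_graph_max_bs: int, pool_size: int):
        if disable_cuda_graph_padding:
            capture_bs = list(range(1, 33)) + [64, 128]
        else:
            capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]  # 8, 16, ..., 160
        if max(capture_bs) > pool_size:
            # make sure the maximum runnable batch size is captured as well
            capture_bs = sorted(set(capture_bs + [pool_size - 1, pool_size]))
        return [bs for bs in capture_bs if bs <= pool_size and bs <= cuda_graph_max_bs]

    print(default_capture_bs(False, cuda_graph_max_bs=160, pool_size=48))
    # -> [1, 2, 4, 8, 16, 24, 32, 40, 47, 48]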
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -197,64 +197,6 @@ class ForwardBatch:
     # For Qwen2-VL
     mrope_positions: torch.Tensor = None

-    def compute_mrope_positions(
-        self, model_runner: ModelRunner, batch: ModelWorkerBatch
-    ):
-        device = model_runner.device
-        hf_config = model_runner.model_config.hf_config
-        mrope_positions_list = [None] * self.seq_lens.shape[0]
-        if self.forward_mode.is_decode():
-            for i, _ in enumerate(mrope_positions_list):
-                mrope_position_delta = (
-                    0
-                    if batch.image_inputs[i] is None
-                    else batch.image_inputs[i].mrope_position_delta
-                )
-                mrope_positions_list[i] = MRotaryEmbedding.get_next_input_positions(
-                    mrope_position_delta,
-                    int(self.seq_lens[i]) - 1,
-                    int(self.seq_lens[i]),
-                )
-        elif self.forward_mode.is_extend():
-            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
-            for i, image_inputs in enumerate(batch.image_inputs):
-                extend_start_loc, extend_seq_len, extend_prefix_len = (
-                    extend_start_loc_cpu[i],
-                    batch.extend_seq_lens[i],
-                    batch.extend_prefix_lens[i],
-                )
-                if image_inputs is None:
-                    # text only
-                    mrope_positions = [
-                        [
-                            pos
-                            for pos in range(
-                                extend_prefix_len, extend_prefix_len + extend_seq_len
-                            )
-                        ]
-                    ] * 3
-                else:
-                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
-                    mrope_positions, mrope_position_delta = (
-                        MRotaryEmbedding.get_input_positions(
-                            input_tokens=self.input_ids[
-                                extend_start_loc : extend_start_loc + extend_seq_len
-                            ],
-                            image_grid_thw=image_inputs.image_grid_thws,
-                            vision_start_token_id=hf_config.vision_start_token_id,
-                            spatial_merge_size=hf_config.vision_config.spatial_merge_size,
-                            context_len=0,
-                        )
-                    )
-                    batch.image_inputs[i].mrope_position_delta = mrope_position_delta
-                mrope_positions_list[i] = mrope_positions
-
-        self.mrope_positions = torch.concat(
-            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
-            axis=1,
-        )
-        self.mrope_positions = self.mrope_positions.to(torch.int64)
-
     @classmethod
     def init_new(
         cls,
@@ -337,7 +279,7 @@ class ForwardBatch:
         ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens

         if model_runner.model_is_mrope:
-            ret.
+            ret._compute_mrope_positions(model_runner, batch)

         # Init lora information
         if model_runner.server_args.lora_paths is not None:
@@ -345,6 +287,63 @@ class ForwardBatch:

         return ret

+    def _compute_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        device = model_runner.device
+        hf_config = model_runner.model_config.hf_config
+        mrope_positions_list = [None] * self.seq_lens.shape[0]
+        if self.forward_mode.is_decode():
+            for i, _ in enumerate(mrope_positions_list):
+                mrope_position_delta = (
+                    0
+                    if batch.image_inputs[i] is None
+                    else batch.image_inputs[i].mrope_position_delta
+                )
+                mrope_positions_list[i] = MRotaryEmbedding.get_next_input_positions(
+                    mrope_position_delta,
+                    int(self.seq_lens[i]) - 1,
+                    int(self.seq_lens[i]),
+                )
+        elif self.forward_mode.is_extend():
+            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
+            for i, image_inputs in enumerate(batch.image_inputs):
+                extend_start_loc, extend_seq_len, extend_prefix_len = (
+                    extend_start_loc_cpu[i],
+                    batch.extend_seq_lens[i],
+                    batch.extend_prefix_lens[i],
+                )
+                if image_inputs is None:
+                    # text only
+                    mrope_positions = [
+                        [
+                            pos
+                            for pos in range(
+                                extend_prefix_len, extend_prefix_len + extend_seq_len
+                            )
+                        ]
+                    ] * 3
+                else:
+                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
+                    mrope_positions, mrope_position_delta = (
+                        MRotaryEmbedding.get_input_positions(
+                            input_tokens=self.input_ids[
+                                extend_start_loc : extend_start_loc + extend_seq_len
+                            ],
+                            image_grid_thw=image_inputs.image_grid_thws,
+                            vision_start_token_id=hf_config.vision_start_token_id,
+                            spatial_merge_size=hf_config.vision_config.spatial_merge_size,
+                            context_len=0,
+                        )
+                    )
+                    batch.image_inputs[i].mrope_position_delta = mrope_position_delta
+                mrope_positions_list[i] = mrope_positions
+        self.mrope_positions = torch.concat(
+            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
+            axis=1,
+        )
+        self.mrope_positions = self.mrope_positions.to(torch.int64)
+

 def compute_position_triton(
     extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor, extend_seq_lens_sum
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -52,6 +52,7 @@ from sglang.srt.mem_cache.memory_pool import (
     MLATokenToKVPool,
     ReqToTokenPool,
 )
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader import get_model
 from sglang.srt.server_args import ServerArgs
@@ -529,6 +530,7 @@ class ModelRunner:
             max_loras_per_batch=self.server_args.max_loras_per_batch,
             load_config=self.load_config,
             dtype=self.dtype,
+            lora_backend=self.server_args.lora_backend,
         )
         logger.info("LoRA manager ready.")

@@ -714,8 +716,6 @@ class ModelRunner:

     def init_cuda_graphs(self):
         """Capture cuda graphs."""
-        from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
-
         self.cuda_graph_runner = None

         if not self.is_generation:
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -31,10 +31,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from vllm.model_executor.layers.activation import QuickGELU

 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
sglang/srt/server_args.py
CHANGED
@@ -113,6 +113,7 @@ class ServerArgs:
     # LoRA
     lora_paths: Optional[List[str]] = None
     max_loras_per_batch: int = 8
+    lora_backend: str = "triton"

     # Kernel backend
     attention_backend: Optional[str] = None
@@ -273,6 +274,10 @@
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -649,13 +654,19 @@
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
+            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
         )
         parser.add_argument(
             "--max-loras-per-batch",
             type=int,
             default=8,
-            help="Maximum number of adapters for a running batch, include base-only request",
+            help="Maximum number of adapters for a running batch, include base-only request.",
+        )
+        parser.add_argument(
+            "--lora-backend",
+            type=str,
+            default="triton",
+            help="Choose the kernel backend for multi-LoRA serving.",
         )

         # Kernel backend
sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -79,11 +79,13 @@ __global__ void build_tree(Tensor<long, 2> parent_list, Tensor<long, 2> selected
 )


-def build_tree_kernel(
+def build_tree_kernel(
+    parent_list, top_score_index, seq_lens, seq_lens_sum, topk, depth, draft_token
+):
     bs = seq_lens.numel()
     device = parent_list.device
     tree_mask = torch.full(
-        (
+        (seq_lens_sum * draft_token + draft_token * draft_token * bs,),
         True,
         device=device,
     )
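The functional change in this hunk is that the flattened tree mask is now sized directly from seq_lens_sum. A small sketch of that size arithmetic with made-up values (the real inputs come from the EAGLE worker):

    # Sketch of the tree-mask size computed in build_tree_kernel above:
    # draft_token mask rows against every prefix token (seq_lens_sum * draft_token)
    # plus a draft_token x draft_token block per request for the draft tokens themselves.
    import torch

    seq_lens = torch.tensor([13, 7])   # placeholder prefix lengths for 2 requests
    draft_token = 4                    # placeholder number of draft tokens
    bs = seq_lens.numel()
    seq_lens_sum = int(seq_lens.sum())

    mask_numel = seq_lens_sum * draft_token + draft_token * draft_token * bs
    tree_mask = torch.full((mask_numel,), True, dtype=torch.bool)
    print(mask_numel)  # (13 + 7) * 4 + 4 * 4 * 2 = 112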
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
ADDED
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+import bisect
+import time
+from typing import TYPE_CHECKING, Callable
+
+import torch
+
+from sglang.srt.model_executor.cuda_graph_runner import (
+    CudaGraphRunner,
+    get_batch_sizes_to_capture,
+    get_global_graph_memory_pool,
+    set_global_graph_memory_pool,
+    set_torch_compile_config,
+)
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.speculative.eagle_utils import EagleDraftInput
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+
+class EAGLEDraftCudaGraphRunner:
+    def __init__(self, eagle_worker: EAGLEWorker):
+        # Parse args
+        self.eagle_worker = eagle_worker
+        self.model_runner = model_runner = eagle_worker.model_runner
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.tp_size = self.model_runner.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.topk = model_runner.server_args.speculative_eagle_topk
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        server_args = model_runner.server_args
+
+        assert self.disable_padding
+
+        # Batch sizes to capture
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+        self.num_tokens_per_bs = server_args.speculative_eagle_topk
+
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+        self.model_runner.draft_attn_backend.init_cuda_graph_state(self.max_num_token)
+        self.seq_len_fill_value = self.model_runner.draft_attn_backend.attn_backends[
+            0
+        ].get_cuda_graph_seq_len_fill_value()
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+            )
+            self.out_cache_loc = torch.zeros(
+                (self.max_num_token * self.speculative_num_steps,), dtype=torch.int64
+            )
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
+            self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
+            self.hidden_states = torch.zeros(
+                (self.max_bs, self.model_runner.model_config.hidden_size),
+                dtype=self.model_runner.dtype,
+            )
+
+        # Capture
+        try:
+            self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n"
+                "Possible solutions:\n"
+                "1. disable cuda graph by --disable-cuda-graph\n"
+                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+                "3. disable torch compile by not using --enable-torch-compile\n"
+                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        is_bs_supported = (
+            forward_batch.batch_size in self.graphs
+            if self.disable_padding
+            else forward_batch.batch_size <= self.max_bs
+        )
+        return is_bs_supported
+
+    def capture(self):
+        CudaGraphRunner.capture(self)
+
+    def capture_one_batch_size(self, num_seqs: int, forward: Callable):
+        graph = torch.cuda.CUDAGraph()
+        stream = self.stream
+        num_tokens = num_seqs * self.num_tokens_per_bs
+
+        # Graph inputs
+        req_pool_indices = self.req_pool_indices[:num_seqs]
+        seq_lens = self.seq_lens[:num_seqs]
+        out_cache_loc = self.out_cache_loc[: num_tokens * self.speculative_num_steps]
+        positions = self.positions[:num_tokens]
+        topk_p = self.topk_p[:num_seqs]
+        topk_index = self.topk_index[:num_seqs]
+        hidden_states = self.hidden_states[:num_seqs]
+
+        spec_info = EagleDraftInput(
+            topk_p=topk_p,
+            topk_index=topk_index,
+            hidden_states=hidden_states,
+        )
+
+        # Forward batch
+        forward_batch = ForwardBatch(
+            forward_mode=ForwardMode.DECODE,
+            batch_size=num_seqs,
+            input_ids=None,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum(),
+            return_logprob=False,
+            positions=positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=(
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            ),
+        )
+
+        # Attention backend
+        self.model_runner.draft_attn_backend.init_forward_metadata_capture_cuda_graph(
+            forward_batch
+        )
+
+        # Run and capture
+        def run_once():
+            # Backup two fileds, which will be modified in-place in `draft_forward`.
+            output_cache_loc_backup = forward_batch.out_cache_loc
+            hidden_states_backup = forward_batch.spec_info.hidden_states
+
+            ret = self.eagle_worker.draft_forward(forward_batch)
+
+            forward_batch.out_cache_loc = output_cache_loc_backup
+            forward_batch.spec_info.hidden_states = hidden_states_backup
+            return ret
+
+        for _ in range(2):
+            torch.cuda.synchronize()
+            self.model_runner.tp_group.barrier()
+
+            run_once()
+
+        torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
+        torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
+        with torch.cuda.graph(
+            graph, pool=get_global_graph_memory_pool(), stream=stream
+        ):
+            out = run_once()
+
+        torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
+        set_global_graph_memory_pool(graph.pool())
+        return graph, out
+
+    def replay(self, forward_batch: ForwardBatch):
+        assert forward_batch.out_cache_loc is not None
+        raw_bs = forward_batch.batch_size
+        raw_num_token = raw_bs * self.num_tokens_per_bs
+
+        # Pad
+        index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
+        if bs != raw_bs:
+            self.seq_lens.fill_(1)
+            self.out_cache_loc.zero_()
+
+        # Common inputs
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        self.out_cache_loc[: raw_num_token * self.speculative_num_steps].copy_(
+            forward_batch.out_cache_loc
+        )
+        self.positions[:raw_num_token].copy_(forward_batch.positions)
+        self.topk_p[:raw_bs].copy_(forward_batch.spec_info.topk_p)
+        self.topk_index[:raw_bs].copy_(forward_batch.spec_info.topk_index)
+        self.hidden_states[:raw_bs].copy_(forward_batch.spec_info.hidden_states)
+
+        # Attention backend
+        self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
+            forward_batch
+        )
+
+        # Replay
+        self.graphs[bs].replay()
+
+        return self.output_buffers[bs]
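In replay above, the raw batch size is mapped to a captured batch size with bisect_left. A standalone sketch of that lookup with an illustrative capture list:

    # Sketch of the batch-size padding used in EAGLEDraftCudaGraphRunner.replay:
    # bisect_left finds the smallest captured batch size that is >= the raw batch size.
    import bisect

    capture_bs = [1, 2, 4, 8, 16, 24, 32]   # illustrative captured sizes (kept sorted)
    for raw_bs in (3, 8, 17):
        padded = capture_bs[bisect.bisect_left(capture_bs, raw_bs)]
        print(raw_bs, "->", padded)
    # 3 -> 4, 8 -> 8, 17 -> 24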