sglang 0.4.1.post7__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (67)
  1. sglang/bench_offline_throughput.py +17 -11
  2. sglang/bench_one_batch.py +14 -6
  3. sglang/bench_serving.py +47 -44
  4. sglang/lang/chat_template.py +31 -0
  5. sglang/srt/configs/load_config.py +1 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  7. sglang/srt/entrypoints/engine.py +5 -2
  8. sglang/srt/entrypoints/http_server.py +24 -0
  9. sglang/srt/function_call_parser.py +494 -0
  10. sglang/srt/layers/activation.py +5 -5
  11. sglang/srt/layers/dp_attention.py +3 -1
  12. sglang/srt/layers/layernorm.py +5 -5
  13. sglang/srt/layers/linear.py +24 -9
  14. sglang/srt/layers/logits_processor.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  16. sglang/srt/layers/moe/fused_moe_native.py +17 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  18. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  19. sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  20. sglang/srt/layers/parameter.py +16 -7
  21. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  22. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  23. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/fp8.py +4 -1
  31. sglang/srt/layers/rotary_embedding.py +6 -1
  32. sglang/srt/layers/sampler.py +28 -8
  33. sglang/srt/layers/torchao_utils.py +12 -6
  34. sglang/srt/managers/detokenizer_manager.py +1 -0
  35. sglang/srt/managers/io_struct.py +36 -5
  36. sglang/srt/managers/schedule_batch.py +31 -25
  37. sglang/srt/managers/scheduler.py +61 -35
  38. sglang/srt/managers/tokenizer_manager.py +4 -0
  39. sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  40. sglang/srt/model_executor/forward_batch_info.py +5 -7
  41. sglang/srt/model_executor/model_runner.py +7 -4
  42. sglang/srt/model_loader/loader.py +75 -0
  43. sglang/srt/model_loader/weight_utils.py +91 -5
  44. sglang/srt/models/commandr.py +14 -2
  45. sglang/srt/models/dbrx.py +9 -1
  46. sglang/srt/models/deepseek_v2.py +3 -3
  47. sglang/srt/models/gemma2.py +9 -1
  48. sglang/srt/models/grok.py +1 -0
  49. sglang/srt/models/minicpm3.py +3 -3
  50. sglang/srt/models/torch_native_llama.py +17 -4
  51. sglang/srt/openai_api/adapter.py +139 -37
  52. sglang/srt/openai_api/protocol.py +5 -4
  53. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  54. sglang/srt/sampling/sampling_batch_info.py +4 -14
  55. sglang/srt/server.py +2 -2
  56. sglang/srt/server_args.py +20 -1
  57. sglang/srt/speculative/eagle_utils.py +37 -15
  58. sglang/srt/speculative/eagle_worker.py +11 -13
  59. sglang/srt/utils.py +62 -65
  60. sglang/test/test_programs.py +1 -0
  61. sglang/test/test_utils.py +81 -22
  62. sglang/version.py +1 -1
  63. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA +7 -7
  64. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/RECORD +67 -56
  65. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  66. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  67. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py CHANGED
@@ -20,7 +20,7 @@ import os
  import time
  import uuid
  from http import HTTPStatus
- from typing import Dict, List
+ from typing import Dict, List, Optional

  from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -40,6 +40,7 @@ from sglang.srt.conversation import (
  generate_chat_conv,
  register_conv_template,
  )
+ from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser
  from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
  from sglang.srt.openai_api.protocol import (
  BatchRequest,
@@ -71,7 +72,6 @@ from sglang.srt.openai_api.protocol import (
  TopLogprob,
  UsageInfo,
  )
- from sglang.srt.utils import TOOLS_TAG_LIST, parse_tool_response
  from sglang.utils import get_exception_traceback

  logger = logging.getLogger(__name__)
@@ -309,6 +309,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  ret,
  to_file=True,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
+ tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  )
  else:
  responses = v1_generate_response(
@@ -877,9 +878,6 @@ def v1_chat_generate_request(
  tools = None
  if request.tools and request.tool_choice != "none":
  request.skip_special_tokens = False
- if request.stream:
- logger.warning("Streaming is not supported with tools.")
- request.stream = False
  if not isinstance(request.tool_choice, str):
  tools = [
  item.function.model_dump()
@@ -908,12 +906,26 @@ def v1_chat_generate_request(
  openai_compatible_messages = openai_compatible_messages[:-1]
  else:
  assistant_prefix = None
- prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
- openai_compatible_messages,
- tokenize=True,
- add_generation_prompt=True,
- tools=tools,
- )
+
+ try:
+ prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
+ openai_compatible_messages,
+ tokenize=True,
+ add_generation_prompt=True,
+ tools=tools,
+ )
+ except:
+ # This except branch will be triggered when the chosen model
+ # has a different tools input format that is not compatiable
+ # with openAI's apply_chat_template tool_call format, like Mistral.
+ tools = [t if "function" in t else {"function": t} for t in tools]
+ prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
+ openai_compatible_messages,
+ tokenize=True,
+ add_generation_prompt=True,
+ tools=tools,
+ )
+
  if assistant_prefix:
  prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
  stop = request.stop
@@ -1005,7 +1017,9 @@ def v1_chat_generate_request(
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


- def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
+ def v1_chat_generate_response(
+ request, ret, to_file=False, cache_report=False, tool_call_parser=None
+ ):
  choices = []

  for idx, ret_item in enumerate(ret):
@@ -1066,12 +1080,13 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
  if finish_reason == "stop":
  finish_reason = "tool_calls"
  try:
- text, call_info_list = parse_tool_response(text, tools) # noqa
+ parser = FunctionCallParser(tools, tool_call_parser)
+ full_normal_text, call_info_list = parser.parse_non_stream(text)
  tool_calls = [
  ToolCall(
- id=str(call_info[0]),
+ id=str(call_info.tool_index),
  function=FunctionResponse(
- name=call_info[1], arguments=call_info[2]
+ name=call_info.name, arguments=call_info.parameters
  ),
  )
  for call_info in call_info_list
@@ -1172,6 +1187,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

  if adapted_request.stream:
+ parser_dict = {}

  async def generate_stream_resp():
  is_firsts = {}
@@ -1184,6 +1200,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  adapted_request, raw_request
  ):
  index = content.get("index", 0)
+ text = content["text"]

  is_first = is_firsts.get(index, True)
  stream_buffer = stream_buffers.get(index, "")
@@ -1263,29 +1280,111 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  text = content["text"]
  delta = text[len(stream_buffer) :]
- stream_buffer = stream_buffer + delta
- choice_data = ChatCompletionResponseStreamChoice(
- index=index,
- delta=DeltaMessage(content=delta),
- finish_reason=(finish_reason["type"] if finish_reason else ""),
- matched_stop=(
- finish_reason["matched"]
- if finish_reason and "matched" in finish_reason
- else None
- ),
- logprobs=choice_logprobs,
- )
- chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- choices=[choice_data],
- model=request.model,
- )
+ new_stream_buffer = stream_buffer + delta

- is_firsts[index] = is_first
- stream_buffers[index] = stream_buffer
- n_prev_tokens[index] = n_prev_token
+ if request.tool_choice != "none" and request.tools:
+ if index not in parser_dict:
+ parser_dict[index] = FunctionCallParser(
+ tools=request.tools,
+ tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+ )
+ parser = parser_dict[index]
+
+ # parse_increment => returns (normal_text, calls)
+ normal_text, calls = parser.parse_stream_chunk(delta)
+
+ # 1) if there's normal_text, output it as normal content
+ if normal_text:
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(content=normal_text),
+ finish_reason=(
+ finish_reason["type"] if finish_reason else ""
+ ),
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"
+
+ # 2) if we found calls, we output them as separate chunk(s)
+ for call_item in calls:
+ # transform call_item -> FunctionResponse + ToolCall
+
+ if (
+ content["meta_info"]["finish_reason"]
+ and content["meta_info"]["finish_reason"]["type"]
+ == "stop"
+ ):
+ latest_delta_len = 0
+ if isinstance(call_item.parameters, str):
+ latest_delta_len = len(call_item.parameters)
+
+ expected_call = json.dumps(
+ parser.multi_format_parser.detectors[0]
+ .prev_tool_call_arr[index]
+ .get("arguments", {}),
+ ensure_ascii=False,
+ )
+ actual_call = parser.multi_format_parser.detectors[
+ 0
+ ].streamed_args_for_tool[index]
+ if latest_delta_len > 0:
+ actual_call = actual_call[:-latest_delta_len]
+ remaining_call = expected_call.replace(
+ actual_call, "", 1
+ )
+ call_item.parameters = remaining_call
+
+ tool_call = ToolCall(
+ id=str(call_item.tool_index),
+ function=FunctionResponse(
+ name=call_item.name,
+ arguments=call_item.parameters,
+ ),
+ )
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(
+ role="assistant", tool_calls=[tool_call]
+ ),
+ finish_reason="tool_call",
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"

- yield f"data: {chunk.model_dump_json()}\n\n"
+ stream_buffers[index] = new_stream_buffer
+ is_firsts[index] = is_first
+
+ else:
+ # No tool calls => just treat this as normal text
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(content=delta),
+ finish_reason=(
+ finish_reason["type"] if finish_reason else ""
+ ),
+ matched_stop=(
+ finish_reason["matched"]
+ if finish_reason and "matched" in finish_reason
+ else None
+ ),
+ logprobs=choice_logprobs,
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"
+ stream_buffers[index] = new_stream_buffer
+ is_firsts[index] = is_first
  if request.stream_options and request.stream_options.include_usage:
  total_prompt_tokens = sum(
  tokens
@@ -1333,7 +1432,10 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  ret = [ret]

  response = v1_chat_generate_response(
- request, ret, cache_report=tokenizer_manager.server_args.enable_cache_report
+ request,
+ ret,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
+ tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  )

  return response
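
Note: with the adapter changes above, /v1/chat/completions no longer forces stream=False when tools are supplied; tool calls are emitted incrementally as DeltaMessage.tool_calls chunks. The following is a minimal client-side sketch of consuming that stream. It is illustrative only and not part of the package: it assumes an sglang server is already running at http://localhost:30000 with a tool-call parser configured, uses the standard openai Python client, and the get_weather tool is a made-up example.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool, for illustration only
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]

stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What is the weather in Paris?"}],
    tools=tools,
    stream=True,
)

name, args = None, ""
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:                         # normal text arrives as content chunks
        print(delta.content, end="")
    for call in delta.tool_calls or []:       # tool calls arrive as tool_calls chunks
        name = call.function.name or name     # the name may arrive only once
        args += call.function.arguments or ""  # arguments arrive incrementally

print("\ntool call:", name, args)
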
sglang/srt/openai_api/protocol.py CHANGED
@@ -262,7 +262,7 @@ class Function(BaseModel):
  """Function descriptions."""

  description: Optional[str] = Field(default=None, examples=[None])
- name: str
+ name: Optional[str] = None
  parameters: Optional[object] = None


@@ -276,7 +276,7 @@ class Tool(BaseModel):
  class ToolChoiceFuncName(BaseModel):
  """The name of tool choice function."""

- name: str
+ name: Optional[str] = None


  class ToolChoice(BaseModel):
@@ -329,8 +329,8 @@ class ChatCompletionRequest(BaseModel):
  class FunctionResponse(BaseModel):
  """Function response."""

- name: str
- arguments: str
+ name: Optional[str] = None
+ arguments: Optional[str] = None


  class ToolCall(BaseModel):
@@ -367,6 +367,7 @@ class ChatCompletionResponse(BaseModel):
  class DeltaMessage(BaseModel):
  role: Optional[str] = None
  content: Optional[str] = None
+ tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


  class ChatCompletionResponseStreamChoice(BaseModel):
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py CHANGED
@@ -3,11 +3,16 @@ from typing import List
  import torch

  from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
- from sglang.srt.utils import is_cuda_available
+ from sglang.srt.utils import get_compiler_backend

- is_cuda = is_cuda_available()
- if is_cuda:
- from sgl_kernel import sampling_scaling_penalties
+
+ @torch.compile(dynamic=True, backend=get_compiler_backend())
+ def apply_scaling_penalties(logits, scaling_penalties):
+ logits[:] = torch.where(
+ logits > 0,
+ logits / scaling_penalties,
+ logits * scaling_penalties,
+ )


  class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -61,16 +66,8 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
  self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

  def _apply(self, logits: torch.Tensor) -> torch.Tensor:
- if is_cuda:
- return sampling_scaling_penalties(
- logits, self.cumulated_repetition_penalties
- )
- else:
- return torch.where(
- logits > 0,
- logits / self.cumulated_repetition_penalties,
- logits * self.cumulated_repetition_penalties,
- )
+ apply_scaling_penalties(logits, self.cumulated_repetition_penalties)
+ return logits

  def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
  self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
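
Note: the new apply_scaling_penalties above replaces the optional sgl_kernel fast path with a single torch.compile'd element-wise update. A self-contained sketch of the same arithmetic, using plain PyTorch and no sglang imports (illustrative only):

import torch

def scale_by_repetition_penalty(logits: torch.Tensor, penalties: torch.Tensor) -> torch.Tensor:
    # Same rule as apply_scaling_penalties: positive logits are divided by the penalty,
    # negative logits are multiplied by it, so previously seen tokens always lose probability mass.
    return torch.where(logits > 0, logits / penalties, logits * penalties)

logits = torch.tensor([[2.0, -1.0, 0.5]])
penalties = torch.tensor([[1.2, 1.2, 1.0]])  # > 1.0 penalizes tokens that already appeared
print(scale_by_repetition_penalty(logits, penalties))
# tensor([[ 1.6667, -1.2000,  0.5000]])
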
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -7,14 +7,11 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

  import torch

- from sglang.srt.utils import is_cuda_available
-
- is_cuda = is_cuda_available()
- if is_cuda:
- from sgl_kernel import sampling_scaling_penalties
-
  import sglang.srt.sampling.penaltylib as penaltylib
  from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+ from sglang.srt.sampling.penaltylib.penalizers.repetition_penalty import (
+ apply_scaling_penalties,
+ )

  logger = logging.getLogger(__name__)

@@ -386,14 +383,7 @@ class SamplingBatchInfo:

  # repetition
  if self.scaling_penalties is not None:
- if is_cuda:
- logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties)
- else:
- logits[:] = torch.where(
- logits > 0,
- logits / self.scaling_penalties,
- logits * self.scaling_penalties,
- )
+ apply_scaling_penalties(logits, self.scaling_penalties)

  # Apply regex vocab_mask
  if self.vocab_mask is not None:
sglang/srt/server.py CHANGED
@@ -12,7 +12,7 @@
  # limitations under the License.
  # ==============================================================================

- # Some shortcuts for backward compatbility.
+ # Some shortcuts for backward compatibility.
  # They will be removed in new versions.
  from sglang.srt.entrypoints.engine import Engine
- from sglang.srt.entrypoints.http_server import launch_server
+ from sglang.srt.entrypoints.http_server import kill_process_tree, launch_server
sglang/srt/server_args.py CHANGED
@@ -75,6 +75,7 @@ class ServerArgs:
  # Other runtime options
  tp_size: int = 1
  stream_interval: int = 1
+ stream_output: bool = False
  random_seed: Optional[int] = None
  constrained_json_whitespace_pattern: Optional[str] = None
  watchdog_timeout: float = 300
@@ -161,6 +162,7 @@ class ServerArgs:

  # Custom logit processor
  enable_custom_logit_processor: bool = False
+ tool_call_parser: str = None

  def __post_init__(self):
  # Set missing default values
@@ -317,6 +319,7 @@ class ServerArgs:
  "dummy",
  "gguf",
  "bitsandbytes",
+ "layered",
  ],
  help="The format of the model weights to load. "
  '"auto" will try to load the weights in the safetensors format '
@@ -330,7 +333,10 @@ class ServerArgs:
  "which is mainly for profiling."
  '"gguf" will load the weights in the gguf format. '
  '"bitsandbytes" will load the weights using bitsandbytes '
- "quantization.",
+ "quantization."
+ '"layered" loads weights layer by layer so that one can quantize a '
+ "layer before loading another to make the peak memory envelope "
+ "smaller.",
  )
  parser.add_argument(
  "--trust-remote-code",
@@ -495,6 +501,11 @@ class ServerArgs:
  default=ServerArgs.stream_interval,
  help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
  )
+ parser.add_argument(
+ "--stream-output",
+ action="store_true",
+ help="Whether to output as a sequence of disjoint segments.",
+ )
  parser.add_argument(
  "--random-seed",
  type=int,
@@ -873,6 +884,14 @@ class ServerArgs:
  action="store_true",
  help="Enable users to pass custom logit processors to the server (disabled by default for security)",
  )
+ # Function Calling
+ parser.add_argument(
+ "--tool-call-parser",
+ type=str,
+ choices=["qwen25", "mistral", "llama3"],
+ default=ServerArgs.tool_call_parser,
+ help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
+ )

  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
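
Note: the new --tool-call-parser argument is what activates the FunctionCallParser path in the OpenAI adapter above. A hedged usage sketch (the model path, port, and book_table tool are placeholders chosen for illustration, not taken from this diff):

# Launch the server with a tool-call parser, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct \
#       --tool-call-parser llama3 --port 30000
import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Book a table for two at 7pm."}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "book_table",  # hypothetical function, for illustration only
            "parameters": {
                "type": "object",
                "properties": {"people": {"type": "integer"}, "time": {"type": "string"}},
            },
        },
    }],
}
resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload).json()
choice = resp["choices"][0]
# When the configured parser recognizes a tool call in the model output, the choice
# carries message.tool_calls with the parsed name/arguments and finish_reason "tool_calls".
print(choice.get("finish_reason"), choice["message"].get("tool_calls"))
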
sglang/srt/speculative/eagle_utils.py CHANGED
@@ -180,7 +180,6 @@ def generate_draft_decode_kv_indices(
  class EAGLEDraftInput(SpecInfo):
  def __init__(self):
  self.prev_mode = ForwardMode.DECODE
- self.sample_output = None

  self.scores: torch.Tensor = None
  self.score_list: List[torch.Tensor] = []
@@ -190,12 +189,16 @@ class EAGLEDraftInput(SpecInfo):
  self.cache_list: List[torch.Tenor] = []
  self.iter = 0

+ # shape: (b, hidden_size)
  self.hidden_states: torch.Tensor = None
+ # shape: (b,)
  self.verified_id: torch.Tensor = None
+ # shape: (b, vocab_size)
+ self.sample_output: torch.Tensor = None
+
  self.positions: torch.Tensor = None
  self.accept_length: torch.Tensor = None
- self.has_finished: bool = False
- self.unfinished_index: List[int] = None
+ self.accept_length_cpu: List[int] = None

  def load_server_args(self, server_args: ServerArgs):
  self.topk: int = server_args.speculative_eagle_topk
@@ -218,7 +221,7 @@ class EAGLEDraftInput(SpecInfo):
  :pre_len
  ] = req.prefix_indices

- batch.req_to_token_pool.req_to_token[req.req_pool_idx][pre_len:seq_len] = (
+ batch.req_to_token_pool.req_to_token[req.req_pool_idx, pre_len:seq_len] = (
  out_cache_loc[pt : pt + req.extend_input_len]
  )

@@ -228,6 +231,14 @@ class EAGLEDraftInput(SpecInfo):
  assert len(batch.extend_lens) == 1
  batch.input_ids = torch.concat((batch.input_ids[1:], self.verified_id))

+ def filter_batch(
+ self,
+ new_indices: torch.Tensor,
+ ):
+ self.sample_output = self.sample_output[: len(new_indices)]
+ self.hidden_states = self.hidden_states[: len(new_indices)]
+ self.verified_id = self.verified_id[: len(new_indices)]
+
  def prepare_for_decode(self, batch: ScheduleBatch):
  prob = self.sample_output # shape: (b * top_k, vocab) or (b, vocab)
  top = torch.topk(prob, self.topk, dim=-1)
@@ -287,7 +298,9 @@ class EAGLEDraftInput(SpecInfo):
  self.cache_list.append(batch.out_cache_loc)
  self.positions = (
  batch.seq_lens[:, None]
- + torch.ones([1, self.topk], device="cuda", dtype=torch.long) * self.iter
+ + torch.full(
+ [1, self.topk], fill_value=self.iter, device="cuda", dtype=torch.long
+ )
  ).flatten()

  bs = len(batch.seq_lens)
@@ -304,24 +317,25 @@ class EAGLEDraftInput(SpecInfo):

  def prepare_extend_after_decode(self, batch: ScheduleBatch):
  batch.out_cache_loc = batch.alloc_token_slots(self.verified_id.numel())
- batch.extend_lens = (self.accept_length + 1).tolist()
+ accept_length_cpu = batch.spec_info.accept_length_cpu
+ batch.extend_lens = [x + 1 for x in accept_length_cpu]
+ batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
+ seq_lens_cpu = batch.seq_lens.tolist()

  pt = 0
- seq_lens = batch.seq_lens.tolist()
-
  i = 0
-
  for req in batch.reqs:
  if req.finished():
  continue
  # assert seq_len - pre_len == req.extend_input_len
- input_len = self.accept_length[i] + 1
- seq_len = seq_lens[i]
+ input_len = batch.extend_lens[i]
+ seq_len = seq_lens_cpu[i]
  batch.req_to_token_pool.req_to_token[req.req_pool_idx][
  seq_len - input_len : seq_len
  ] = batch.out_cache_loc[pt : pt + input_len]
  pt += input_len
  i += 1
+ assert pt == batch.out_cache_loc.shape[0]

  self.positions = torch.empty_like(self.verified_id)
  new_verified_id = torch.empty_like(self.accept_length, dtype=torch.long)
@@ -337,7 +351,7 @@ class EAGLEDraftInput(SpecInfo):
  triton.next_power_of_2(self.spec_steps + 1),
  )

- batch.seq_lens_sum = sum(batch.seq_lens)
+ batch.seq_lens_sum = sum(seq_lens_cpu)
  batch.input_ids = self.verified_id
  self.verified_id = new_verified_id

@@ -565,6 +579,8 @@ class EagleVerifyInput(SpecInfo):
  finished_extend_len = {} # {rid:accept_length + 1}
  accept_index_cpu = accept_index.tolist()
  predict_cpu = predict.tolist()
+ has_finished = False
+
  # iterate every accepted token and check if req has finished after append the token
  # should be checked BEFORE free kv cache slots
  for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)):
@@ -578,7 +594,7 @@ class EagleVerifyInput(SpecInfo):
  finished_extend_len[req.rid] = j + 1
  req.check_finished()
  if req.finished():
- draft_input.has_finished = True
+ has_finished = True
  # set all tokens after finished token to -1 and break
  accept_index[i, j + 1 :] = -1
  break
@@ -587,12 +603,12 @@ class EagleVerifyInput(SpecInfo):
  if not req.finished():
  new_accept_index.extend(new_accept_index_)
  unfinished_index.append(i)
+ req.spec_verify_ct += 1
  accept_length = (accept_index != -1).sum(dim=1) - 1

  accept_index = accept_index[accept_index != -1]
  accept_length_cpu = accept_length.tolist()
  verified_id = predict[accept_index]
- verified_id_cpu = verified_id.tolist()

  evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
  evict_mask[accept_index] = False
@@ -614,7 +630,13 @@ class EagleVerifyInput(SpecInfo):
  draft_input.verified_id = predict[new_accept_index]
  draft_input.hidden_states = batch.spec_info.hidden_states[new_accept_index]
  draft_input.accept_length = accept_length[unfinished_index]
- draft_input.unfinished_index = unfinished_index
+ draft_input.accept_length_cpu = [
+ accept_length_cpu[i] for i in unfinished_index
+ ]
+ if has_finished:
+ draft_input.seq_lens_for_draft_extend = batch.seq_lens[unfinished_index]
+ else:
+ draft_input.seq_lens_for_draft_extend = batch.seq_lens

  logits_output.next_token_logits = logits_output.next_token_logits[accept_index]
  return (
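
Note: a toy illustration (not sglang code) of the accept-index bookkeeping used in the verification path above, where -1 marks rejected or padded draft slots per request:

import torch

accept_index = torch.tensor([[5, 6, -1, -1],
                             [2, 3, 4, 7]])    # rows = requests, cols = draft positions
predict = torch.arange(100)                    # stand-in for the verified token predictions

accept_length = (accept_index != -1).sum(dim=1) - 1   # extra tokens accepted per request
flat_index = accept_index[accept_index != -1]         # drop rejected slots, flatten
verified_id = predict[flat_index]                     # tokens that survive verification
print(accept_length.tolist(), verified_id.tolist())   # [1, 3] [5, 6, 2, 3, 4, 7]
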
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -13,6 +13,7 @@ from sglang.srt.model_executor.forward_batch_info import (
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
+ from sglang.srt.utils import rank0_print


  class EAGLEWorker(TpModelWorker):
@@ -50,18 +51,18 @@ class EAGLEWorker(TpModelWorker):

  def forward_draft_decode(self, batch: ScheduleBatch):
  batch.spec_info.prepare_for_decode(batch)
+ batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
  model_worker_batch = batch.get_model_worker_batch()
  forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
- forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
  logits_output = self.model_runner.forward(forward_batch)
  self.capture_for_decode(logits_output, forward_batch)

  def forward_draft_extend(self, batch: ScheduleBatch):
  self._set_mem_pool(batch, self.model_runner)
  batch.spec_info.prepare_for_extend(batch)
+ batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
  model_worker_batch = batch.get_model_worker_batch()
  forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
- forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
  logits_output = self.model_runner.forward(forward_batch)
  self.capture_for_decode(logits_output, forward_batch)
  self._set_mem_pool(batch, self.target_worker.model_runner)
@@ -134,26 +135,23 @@ class EAGLEWorker(TpModelWorker):
  batch.req_to_token_pool = runner.req_to_token_pool

  def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+ seq_lens_backup = batch.seq_lens
+
  self._set_mem_pool(batch, self.model_runner)
  batch.forward_mode = ForwardMode.DRAFT_EXTEND
- if batch.spec_info.has_finished:
- index = batch.spec_info.unfinished_index
- seq_lens = batch.seq_lens
- batch.seq_lens = batch.seq_lens[index]
-
  batch.spec_info.prepare_extend_after_decode(batch)
+ batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
  model_worker_batch = batch.get_model_worker_batch()
  forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
- forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
  logits_output = self.model_runner.forward(forward_batch)
-
- batch.spec_info.hidden_states = logits_output.hidden_states
  self.capture_for_decode(logits_output, forward_batch)
- batch.forward_mode = ForwardMode.DECODE
- if batch.spec_info.has_finished:
- batch.seq_lens = seq_lens
  self._set_mem_pool(batch, self.target_worker.model_runner)

+ # Restore backup.
+ # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
+ batch.forward_mode = ForwardMode.DECODE
+ batch.seq_lens = seq_lens_backup
+
  def capture_for_decode(
  self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
  ):