sglang 0.4.1.post7__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. sglang/bench_offline_throughput.py +17 -11
  2. sglang/bench_one_batch.py +14 -6
  3. sglang/bench_serving.py +47 -44
  4. sglang/lang/chat_template.py +31 -0
  5. sglang/srt/configs/load_config.py +1 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  7. sglang/srt/entrypoints/engine.py +5 -2
  8. sglang/srt/entrypoints/http_server.py +24 -0
  9. sglang/srt/function_call_parser.py +494 -0
  10. sglang/srt/layers/activation.py +5 -5
  11. sglang/srt/layers/dp_attention.py +3 -1
  12. sglang/srt/layers/layernorm.py +5 -5
  13. sglang/srt/layers/linear.py +24 -9
  14. sglang/srt/layers/logits_processor.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  16. sglang/srt/layers/moe/fused_moe_native.py +17 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  18. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  19. sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  20. sglang/srt/layers/parameter.py +16 -7
  21. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  22. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  23. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/fp8.py +4 -1
  31. sglang/srt/layers/rotary_embedding.py +6 -1
  32. sglang/srt/layers/sampler.py +28 -8
  33. sglang/srt/layers/torchao_utils.py +12 -6
  34. sglang/srt/managers/detokenizer_manager.py +1 -0
  35. sglang/srt/managers/io_struct.py +36 -5
  36. sglang/srt/managers/schedule_batch.py +31 -25
  37. sglang/srt/managers/scheduler.py +61 -35
  38. sglang/srt/managers/tokenizer_manager.py +4 -0
  39. sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  40. sglang/srt/model_executor/forward_batch_info.py +5 -7
  41. sglang/srt/model_executor/model_runner.py +7 -4
  42. sglang/srt/model_loader/loader.py +75 -0
  43. sglang/srt/model_loader/weight_utils.py +91 -5
  44. sglang/srt/models/commandr.py +14 -2
  45. sglang/srt/models/dbrx.py +9 -1
  46. sglang/srt/models/deepseek_v2.py +3 -3
  47. sglang/srt/models/gemma2.py +9 -1
  48. sglang/srt/models/grok.py +1 -0
  49. sglang/srt/models/minicpm3.py +3 -3
  50. sglang/srt/models/torch_native_llama.py +17 -4
  51. sglang/srt/openai_api/adapter.py +139 -37
  52. sglang/srt/openai_api/protocol.py +5 -4
  53. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  54. sglang/srt/sampling/sampling_batch_info.py +4 -14
  55. sglang/srt/server.py +2 -2
  56. sglang/srt/server_args.py +20 -1
  57. sglang/srt/speculative/eagle_utils.py +37 -15
  58. sglang/srt/speculative/eagle_worker.py +11 -13
  59. sglang/srt/utils.py +62 -65
  60. sglang/test/test_programs.py +1 -0
  61. sglang/test/test_utils.py +81 -22
  62. sglang/version.py +1 -1
  63. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA +7 -7
  64. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/RECORD +67 -56
  65. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  66. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  67. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py

@@ -1,17 +1,19 @@
  import logging
- from typing import Dict, List
+ from typing import List

  import torch
+ import torch.distributed as dist
  from torch import nn

+ from sglang.srt.distributed import get_tensor_model_parallel_group
+ from sglang.srt.layers.dp_attention import get_attention_tp_group
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.managers.schedule_batch import global_server_args_dict
- from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
  from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
- from sglang.srt.utils import crash_on_warnings, is_flashinfer_available
+ from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda_available

- if is_flashinfer_available():
-     from flashinfer.sampling import (
+ if is_cuda_available():
+     from sgl_kernel import (
          min_p_sampling_from_probs,
          top_k_renorm_prob,
          top_k_top_p_sampling_from_probs,
@@ -21,11 +23,17 @@ if is_flashinfer_available():

  logger = logging.getLogger(__name__)

+ SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+

  class Sampler(nn.Module):
      def __init__(self):
          super().__init__()
          self.use_nan_detectioin = global_server_args_dict["enable_nan_detection"]
+         self.tp_sync_group = get_tensor_model_parallel_group().device_group
+
+         if global_server_args_dict["enable_dp_attention"]:
+             self.tp_sync_group = get_attention_tp_group().device_group

      def forward(
          self,
@@ -109,8 +117,6 @@ class Sampler(nn.Module):
                  f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
              )

-         batch_next_token_ids = batch_next_token_ids.to(torch.int32)
-
          # Attach logprobs to logits_output (in-place modification)
          if return_logprob:
              if any(x > 0 for x in top_logprobs_nums):
@@ -124,7 +130,21 @@ class Sampler(nn.Module):
                  batch_next_token_ids,
              ]

-         return batch_next_token_ids
+         if SYNC_TOKEN_IDS_ACROSS_TP or sampling_info.grammars:
+             # For performance reasons, SGLang does not sync the final token IDs across TP ranks by default.
+             # This saves one all-reduce, but the correctness of this approach depends on the determinism of several operators:
+             # the last all-reduce, the last lm_head matmul, and all sampling kernels.
+             # These kernels are deterministic in most cases, but there are some rare instances where they are not deterministic.
+             # In such cases, enable this env variable to prevent hanging due to TP ranks becoming desynchronized.
+             # When using xgrammar, this becomes more likely so we also do the sync when grammar is used.
+
+             torch.distributed.all_reduce(
+                 batch_next_token_ids,
+                 op=dist.ReduceOp.MIN,
+                 group=self.tp_sync_group,
+             )
+
+         return batch_next_token_ids.to(torch.int32)

      def _apply_custom_logit_processor(
          self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
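The TP-sync branch added above is the interesting part of this change. A minimal, self-contained sketch of the gating logic (the function name maybe_sync_token_ids and the plain os.environ check standing in for get_bool_env_var are illustrative, not package code):

import os

import torch
import torch.distributed as dist


def maybe_sync_token_ids(
    batch_next_token_ids: torch.Tensor,
    tp_sync_group,  # a torch.distributed process group (TP or attention-TP)
    has_grammar: bool,
) -> torch.Tensor:
    # Mirrors the new branch in Sampler.forward: the sync is skipped by default
    # to save an all-reduce, and enabled via the env var or whenever
    # grammar-constrained decoding (xgrammar) is active.
    sync_requested = os.environ.get("SYNC_TOKEN_IDS_ACROSS_TP", "").lower() in ("1", "true")
    if sync_requested or has_grammar:
        # ReduceOp.MIN forces every TP rank to agree on a single token id even
        # if a rare non-deterministic kernel produced different samples per rank.
        dist.all_reduce(batch_next_token_ids, op=dist.ReduceOp.MIN, group=tp_sync_group)
    return batch_next_token_ids.to(torch.int32)

Any deterministic reduction would keep the ranks in lockstep; MIN simply picks one of the candidate ids consistently on all ranks.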
sglang/srt/layers/torchao_utils.py

@@ -5,6 +5,7 @@ Common utilities for torchao.
  import logging
  import os
  import pwd
+ from typing import Callable, Optional

  import torch

@@ -27,8 +28,18 @@ def save_gemlite_cache(print_error: bool = False) -> bool:
      return True


+ def proj_filter(
+     module: torch.nn.Module,
+     fqn: str,
+ ):
+     """Filter function for quantizing projection layers."""
+     return "proj" in fqn
+
+
  def apply_torchao_config_to_model(
-     model: torch.nn.Module, torchao_config: str, filter_fn=None
+     model: torch.nn.Module,
+     torchao_config: str,
+     filter_fn: Optional[Callable] = proj_filter,
  ):
      """Quantize a modelwith torchao quantization specified by torchao_config

@@ -49,11 +60,6 @@ def apply_torchao_config_to_model(
      )
      from torchao.quantization.observer import PerRow, PerTensor

-     if filter_fn is None:
-
-         def filter_fn(module, fqn):
-             return "proj" in fqn
-
      if torchao_config == "" or torchao_config is None:
          return model
      elif "int8wo" in torchao_config:
sglang/srt/managers/detokenizer_manager.py

@@ -201,6 +201,7 @@ class DetokenizerManager:
                      prompt_tokens=recv_obj.prompt_tokens,
                      completion_tokens=recv_obj.completion_tokens,
                      cached_tokens=recv_obj.cached_tokens,
+                     spec_verify_ct=recv_obj.spec_verify_ct,
                      input_token_logprobs_val=recv_obj.input_token_logprobs_val,
                      input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
                      output_token_logprobs_val=recv_obj.output_token_logprobs_val,
sglang/srt/managers/io_struct.py

@@ -17,7 +17,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
  """

  import uuid
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from enum import Enum
  from typing import Dict, List, Optional, Union

@@ -69,8 +69,10 @@ class GenerateReqInput:

      # Session info for continual prompting
      session_params: Optional[Union[List[Dict], Dict]] = None
-     # Custom logit processor (serialized function)
-     custom_logit_processor: Optional[Union[List[Optional[str]], Optional[str]]] = None
+     # Custom logit processor for advanced sampling control. Must be a serialized instance
+     # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+     # Use the processor's `to_str()` method to generate the serialized string.
+     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None

      def normalize_batch_and_arguments(self):
          if (
@@ -248,8 +250,9 @@ class TokenizedGenerateReqInput:
      # Session info for continual prompting
      session_params: Optional[SessionParams] = None

-     # Custom logit processor (serialized function)
-     # TODO (hpguo): Add an example and update doc string here
+     # Custom logit processor for advanced sampling control. Must be a serialized instance
+     # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+     # Use the processor's `to_str()` method to generate the serialized string.
      custom_logit_processor: Optional[str] = None

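A hedged sketch of producing the serialized string these fields expect. Only the to_str() round-trip is stated by the docstrings above; the __call__ signature used below is an assumption made for illustration:

from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor


class DisallowTokenZero(CustomLogitProcessor):
    # The callback signature is assumed; the class only needs to be
    # serializable via to_str() per the comments in the diff.
    def __call__(self, logits, custom_params=None):
        logits[..., 0] = float("-inf")  # never sample token id 0
        return logits


serialized = DisallowTokenZero().to_str()
# GenerateReqInput(..., custom_logit_processor=serialized)          # single request
# GenerateReqInput(..., custom_logit_processor=[serialized, None])  # per-request in a batch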
@@ -351,10 +354,13 @@ class BatchTokenIDOut:
      skip_special_tokens: List[bool]
      spaces_between_special_tokens: List[bool]
      no_stop_trim: List[bool]
+
      # Token counts
      prompt_tokens: List[int]
      completion_tokens: List[int]
      cached_tokens: List[int]
+     spec_verify_ct: List[int]
+
      # Logprobs
      input_token_logprobs_val: List[float]
      input_token_logprobs_idx: List[int]
@@ -379,6 +385,7 @@ class BatchStrOut:
      prompt_tokens: List[int]
      completion_tokens: List[int]
      cached_tokens: List[int]
+     spec_verify_ct: List[int]

      # Logprobs
      input_token_logprobs_val: List[float]
@@ -533,3 +540,27 @@ class CloseSessionReqInput:
  class OpenSessionReqOutput:
      session_id: Optional[str]
      success: bool
+
+
+ @dataclass
+ class Function:
+     description: Optional[str] = None
+     name: Optional[str] = None
+     parameters: Optional[object] = None
+
+
+ @dataclass
+ class Tool:
+     function: Function
+     type: Optional[str] = "function"
+
+
+ @dataclass
+ class FunctionCallReqInput:
+     text: str  # The text to parse.
+     tools: List[Tool] = field(
+         default_factory=list
+     )  # A list of available function tools (name, parameters, etc.).
+     tool_call_parser: Optional[str] = (
+         None  # Specify the parser type, e.g. 'llama3', 'qwen25', or 'mistral'. If not specified, tries all.
+     )
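A short construction example for the new tool-calling dataclasses (the weather tool and its JSON-schema parameters are invented for illustration):

from sglang.srt.managers.io_struct import Function, FunctionCallReqInput, Tool

get_weather = Tool(
    function=Function(
        name="get_weather",
        description="Look up the current weather for a city.",
        parameters={"type": "object", "properties": {"city": {"type": "string"}}},
    )
)

req = FunctionCallReqInput(
    text='{"name": "get_weather", "parameters": {"city": "Paris"}}',
    tools=[get_weather],
    tool_call_parser="llama3",  # or 'qwen25' / 'mistral'; None tries all parsers
)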
sglang/srt/managers/schedule_batch.py

@@ -247,12 +247,12 @@ class Req:
          # Each decode stage's output ids
          self.output_ids = []
          # fill_ids = origin_input_ids + output_ids. Updated if chunked.
+         self.fill_ids = None
          self.session_id = session_id
          self.input_embeds = input_embeds

          # Sampling info
          self.sampling_params = sampling_params
-         self.lora_path = lora_path
          self.custom_logit_processor = custom_logit_processor

          # Memory pool info
@@ -300,7 +300,7 @@ class Req:
          self.logprob_start_len = 0
          self.top_logprobs_num = top_logprobs_num

-         # Logprobs (return value)
+         # Logprobs (return values)
          self.input_token_logprobs_val: Optional[List[float]] = None
          self.input_token_logprobs_idx: Optional[List[int]] = None
          self.input_top_logprobs_val: Optional[List[float]] = None
@@ -329,8 +329,14 @@ class Req:
          # Constrained decoding
          self.grammar: Optional[BaseGrammarObject] = None

-         # The number of cached tokens, that were already cached in the KV cache
+         # The number of cached tokens that were already cached in the KV cache
          self.cached_tokens = 0
+         self.already_computed = 0
+
+         # The number of verification forward passes in the speculative decoding.
+         # This is used to compute the average acceptance length per request.
+         self.spec_verify_ct = 0
+         self.lora_path = lora_path

      def extend_image_inputs(self, image_inputs):
          if self.image_inputs is None:
@@ -550,13 +556,13 @@ class ScheduleBatch:
      next_batch_sampling_info: SamplingBatchInfo = None

      # Batched arguments to model runner
-     input_ids: torch.Tensor = None
-     input_embeds: torch.Tensor = None
-     req_pool_indices: torch.Tensor = None
-     seq_lens: torch.Tensor = None
+     input_ids: torch.Tensor = None  # shape: [b], int32
+     input_embeds: torch.Tensor = None  # shape: [b, hidden_size], float32
+     req_pool_indices: torch.Tensor = None  # shape: [b], int32
+     seq_lens: torch.Tensor = None  # shape: [b], int64
      # The output locations of the KV cache
-     out_cache_loc: torch.Tensor = None
-     output_ids: torch.Tensor = None
+     out_cache_loc: torch.Tensor = None  # shape: [b], int32
+     output_ids: torch.Tensor = None  # shape: [b], int32

      # The sum of all sequence lengths
      seq_lens_sum: int = None
@@ -750,13 +756,6 @@ class ScheduleBatch:

          pt = 0
          for i, req in enumerate(reqs):
-             already_computed = (
-                 req.extend_logprob_start_len + 1 + req.cached_tokens
-                 if req.extend_logprob_start_len > 0
-                 else 0
-             )
-             req.cached_tokens += len(req.prefix_indices) - already_computed
-
              req.req_pool_idx = req_pool_indices[i]
              pre_len, seq_len = len(req.prefix_indices), len(req.fill_ids)
              seq_lens.append(seq_len)
@@ -772,15 +771,20 @@ class ScheduleBatch:
                  # If req.input_embeds is already a list, append its content directly
                  input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting

-             # Compute the relative logprob_start_len in an extend batch
-             if req.logprob_start_len >= pre_len:
-                 extend_logprob_start_len = min(
-                     req.logprob_start_len - pre_len, req.extend_input_len - 1
-                 )
-             else:
-                 extend_logprob_start_len = req.extend_input_len - 1
+             if req.return_logprob:
+                 # Compute the relative logprob_start_len in an extend batch
+                 if req.logprob_start_len >= pre_len:
+                     extend_logprob_start_len = min(
+                         req.logprob_start_len - pre_len, req.extend_input_len - 1
+                     )
+                 else:
+                     raise RuntimeError(
+                         f"This should never happen. {req.logprob_start_len=}, {pre_len=}"
+                     )
+                 req.extend_logprob_start_len = extend_logprob_start_len

-             req.extend_logprob_start_len = extend_logprob_start_len
+             req.cached_tokens += pre_len - req.already_computed
+             req.already_computed = seq_len
              req.is_retracted = False
              pre_lens.append(pre_len)

@@ -1026,7 +1030,7 @@ class ScheduleBatch:
          self.input_ids = torch.empty(0, dtype=torch.int32, device=self.device)
          self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
          self.out_cache_loc = torch.empty(0, dtype=torch.int32, device=self.device)
-         self.req_pool_indices = torch.empty(0, dtype=torch.int64, device=self.device)
+         self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
          self.seq_lens_sum = 0
          self.extend_num_tokens = 0
          self.sampling_info = SamplingBatchInfo.from_schedule_batch(
@@ -1112,6 +1116,8 @@ class ScheduleBatch:
          self.has_grammar = any(req.grammar for req in self.reqs)

          self.sampling_info.filter_batch(keep_indices, new_indices)
+         if self.spec_info:
+             self.spec_info.filter_batch(new_indices)

      def merge_batch(self, other: "ScheduleBatch"):
          # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
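The reworked cached-token accounting in prepare_for_extend is easiest to see with concrete numbers. A minimal sketch (FakeReq and account are illustrative stand-ins for Req and the loop body; the token counts are made up):

class FakeReq:
    def __init__(self):
        self.cached_tokens = 0      # tokens served from the KV/radix cache
        self.already_computed = 0   # tokens this request has already accounted for


def account(req: FakeReq, pre_len: int, seq_len: int) -> None:
    # Same two lines as the diff: only the prefix that was not yet accounted
    # for counts as cached, then the watermark advances to this step's length.
    req.cached_tokens += pre_len - req.already_computed
    req.already_computed = seq_len


req = FakeReq()
account(req, pre_len=100, seq_len=512)    # chunk 1: 100-token prefix-cache hit
account(req, pre_len=512, seq_len=1024)   # chunk 2: chunk 1 is in KV but was computed here
print(req.cached_tokens)  # 100 (this request's own computed chunks are not counted as cached)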
sglang/srt/managers/scheduler.py

@@ -281,6 +281,7 @@ class Scheduler:
          # Print debug info
          logger.info(
              f"max_total_num_tokens={self.max_total_num_tokens}, "
+             f"chunked_prefill_size={server_args.chunked_prefill_size}, "
              f"max_prefill_tokens={self.max_prefill_tokens}, "
              f"max_running_requests={self.max_running_requests}, "
              f"context_len={self.model_config.context_len}"
@@ -408,6 +409,11 @@ class Scheduler:
              },
          )

+         # The largest prefill length of a single request
+         self._largest_prefill_len: int = 0
+         # The largest context length (prefill + generation) of a single request
+         self._largest_prefill_decode_len: int = 0
+
          # Init request dispatcher
          self._request_dispatcher = TypeBasedDispatcher(
              [
@@ -480,7 +486,7 @@ class Scheduler:
      @torch.no_grad()
      def event_loop_overlap(self):
          """A scheduler loop that overlaps the CPU processing and GPU computation."""
-         result_queue = deque()
+         self.result_queue = deque()

          while True:
              recv_reqs = self.recv_requests()
@@ -491,7 +497,7 @@ class Scheduler:

              if batch:
                  result = self.run_batch(batch)
-                 result_queue.append((batch.copy(), result))
+                 self.result_queue.append((batch.copy(), result))

                  if self.last_batch is None:
                      # Create a dummy first batch to start the pipeline for overlap schedule.
@@ -505,7 +511,7 @@ class Scheduler:

              if self.last_batch:
                  # Process the results of the last batch
-                 tmp_batch, tmp_result = result_queue.popleft()
+                 tmp_batch, tmp_result = self.result_queue.popleft()
                  tmp_batch.next_batch_sampling_info = (
                      self.tp_worker.cur_sampling_info if batch else None
                  )
@@ -636,7 +642,7 @@ class Scheduler:
              self.waiting_queue.append(req)
              return

-         # Handle image inputs
+         # Handle multimodal inputs
          if recv_req.image_inputs is not None:
              image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
              # Expand a single image token into multiple dummy tokens for receiving image embeddings
@@ -660,24 +666,23 @@ class Scheduler:
              self.waiting_queue.append(req)
              return

-         # Copy more attributes
-         req.logprob_start_len = recv_req.logprob_start_len
-
-         if req.logprob_start_len == -1:
-             # By default, only return the logprobs for output tokens
-             req.logprob_start_len = len(req.origin_input_ids) - 1
-
          # Validate prompts length
          error_msg = validate_input_length(
              req,
              self.max_req_input_len,
              self.server_args.allow_auto_truncate,
          )
-
          if error_msg:
              self.waiting_queue.append(req)
              return

+         # Copy more attributes
+         if recv_req.logprob_start_len == -1:
+             # By default, only return the logprobs for output tokens
+             req.logprob_start_len = len(req.origin_input_ids) - 1
+         else:
+             req.logprob_start_len = recv_req.logprob_start_len
+
          req.sampling_params.max_new_tokens = min(
              (
                  req.sampling_params.max_new_tokens
@@ -725,15 +730,26 @@ class Scheduler:
          req.tokenizer = self.tokenizer

          # Validate prompts length
-         validate_input_length(
+         error_msg = validate_input_length(
              req,
              self.max_req_input_len,
              self.server_args.allow_auto_truncate,
          )
+         if error_msg:
+             self.waiting_queue.append(req)
+             return

+         # Copy more attributes
+         req.logprob_start_len = len(req.origin_input_ids) - 1
          self.waiting_queue.append(req)

-     def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
+     def log_prefill_stats(
+         self,
+         adder: PrefillAdder,
+         can_run_list: List[Req],
+         running_bs: ScheduleBatch,
+         has_being_chunked: bool,
+     ):
          self.tree_cache_metrics["total"] += (
              adder.log_input_tokens + adder.log_hit_tokens
          ) / 10**9
@@ -1023,7 +1039,7 @@ class Scheduler:
              )

          # Check for jump-forward
-         if not self.disable_jump_forward:
+         if not self.disable_jump_forward and batch.has_grammar:
              jump_forward_reqs = batch.check_for_jump_forward(self.pad_input_ids_func)
              self.waiting_queue.extend(jump_forward_reqs)
              if batch.is_empty():
@@ -1044,26 +1060,23 @@ class Scheduler:
          self.forward_ct += 1

          if self.is_generation:
-             if batch.forward_mode.is_decode_or_idle() or batch.extend_num_tokens != 0:
-                 if self.spec_algorithm.is_none():
-                     model_worker_batch = batch.get_model_worker_batch()
-                     logits_output, next_token_ids = (
-                         self.tp_worker.forward_batch_generation(model_worker_batch)
-                     )
-                 else:
-                     (
-                         logits_output,
-                         next_token_ids,
-                         model_worker_batch,
-                         num_accepted_tokens,
-                     ) = self.draft_worker.forward_batch_speculative_generation(batch)
-                     self.spec_num_total_accepted_tokens += (
-                         num_accepted_tokens + batch.batch_size()
-                     )
-                     self.spec_num_total_forward_ct += batch.batch_size()
-                     self.num_generated_tokens += num_accepted_tokens
+             if self.spec_algorithm.is_none():
+                 model_worker_batch = batch.get_model_worker_batch()
+                 logits_output, next_token_ids = self.tp_worker.forward_batch_generation(
+                     model_worker_batch
+                 )
              else:
-                 assert False, "batch.extend_num_tokens == 0, this is unexpected!"
+                 (
+                     logits_output,
+                     next_token_ids,
+                     model_worker_batch,
+                     num_accepted_tokens,
+                 ) = self.draft_worker.forward_batch_speculative_generation(batch)
+                 self.spec_num_total_accepted_tokens += (
+                     num_accepted_tokens + batch.batch_size()
+                 )
+                 self.spec_num_total_forward_ct += batch.batch_size()
+                 self.num_generated_tokens += num_accepted_tokens
              batch.output_ids = next_token_ids

              ret = GenerationBatchResult(
@@ -1072,7 +1085,6 @@ class Scheduler:
                  bid=model_worker_batch.bid,
              )
          else:  # embedding or reward model
-             assert batch.extend_num_tokens != 0
              model_worker_batch = batch.get_model_worker_batch()
              embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
              ret = EmbeddingBatchResult(
@@ -1371,6 +1383,7 @@ class Scheduler:
              prompt_tokens = []
              completion_tokens = []
              cached_tokens = []
+             spec_verify_ct = []

              if return_logprob:
                  input_token_logprobs_val = []
@@ -1424,6 +1437,9 @@ class Scheduler:
                  completion_tokens.append(len(req.output_ids))
                  cached_tokens.append(req.cached_tokens)

+                 if not self.spec_algorithm.is_none():
+                     spec_verify_ct.append(req.spec_verify_ct)
+
                  if return_logprob:
                      input_token_logprobs_val.append(req.input_token_logprobs_val)
                      input_token_logprobs_idx.append(req.input_token_logprobs_idx)
@@ -1451,6 +1467,7 @@ class Scheduler:
                  prompt_tokens,
                  completion_tokens,
                  cached_tokens,
+                 spec_verify_ct,
                  input_token_logprobs_val,
                  input_token_logprobs_idx,
                  output_token_logprobs_val,
@@ -1564,6 +1581,15 @@ class Scheduler:
              self.grammar_backend.reset()
              self.req_to_token_pool.clear()
              self.token_to_kv_pool.clear()
+
+             if not self.spec_algorithm.is_none():
+                 self.draft_worker.model_runner.req_to_token_pool.clear()
+                 self.draft_worker.model_runner.token_to_kv_pool.clear()
+
+             self.num_generated_tokens = 0
+             self.forward_ct_decode = 0
+             self.spec_num_total_accepted_tokens = 0
+             self.spec_num_total_forward_ct = 0
              torch.cuda.empty_cache()
              logger.info("Cache flushed successfully!")
              if_success = True
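A hedged sketch of how the speculative-decoding counters updated in run_batch and reset in flush_cache combine into an average acceptance length; the SpecStats wrapper and the ratio are an interpretation, not code from the package:

class SpecStats:
    def __init__(self):
        self.spec_num_total_accepted_tokens = 0
        self.spec_num_total_forward_ct = 0

    def on_speculative_step(self, num_accepted_tokens: int, batch_size: int) -> None:
        # Per verify step, every request contributes one forward pass and
        # (1 + its accepted draft tokens) emitted tokens, as in run_batch.
        self.spec_num_total_accepted_tokens += num_accepted_tokens + batch_size
        self.spec_num_total_forward_ct += batch_size

    def avg_accept_length(self) -> float:
        if self.spec_num_total_forward_ct == 0:
            return 0.0
        return self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct

    def reset(self) -> None:
        # flush_cache zeroes the same counters.
        self.spec_num_total_accepted_tokens = 0
        self.spec_num_total_forward_ct = 0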
sglang/srt/managers/tokenizer_manager.py

@@ -785,6 +785,9 @@ class TokenizerManager:
                  i,
              )

+             if self.server_args.speculative_algorithm:
+                 meta_info["spec_verify_ct"] = recv_obj.spec_verify_ct[i]
+
              if not isinstance(recv_obj, BatchEmbeddingOut):
                  meta_info.update(
                      {
@@ -809,6 +812,7 @@ class TokenizerManager:
                      "embedding": recv_obj.embeddings[i],
                      "meta_info": meta_info,
                  }
+
              state.out_list.append(out_dict)
              state.finished = recv_obj.finished_reasons[i] is not None
              state.event.set()
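With spec_verify_ct now surfaced per request, a client can estimate that request's acceptance length roughly as follows (a sketch assuming completion_tokens is present in the same meta_info, as the surrounding fields suggest):

def per_request_accept_length(meta_info: dict) -> float:
    # spec_verify_ct counts verification forward passes for this request;
    # dividing the generated tokens by it approximates tokens per verify pass.
    verify_ct = meta_info.get("spec_verify_ct", 0)
    if verify_ct == 0:
        return 0.0
    return meta_info["completion_tokens"] / verify_ct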