sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +2 -2
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +9 -7
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +48 -43
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +7 -2
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +227 -120
- sglang/srt/disaggregation/nixl/conn.py +1 -0
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +7 -1
- sglang/srt/entrypoints/engine.py +17 -2
- sglang/srt/entrypoints/http_server.py +17 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +1 -1
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +72 -71
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +76 -24
- sglang/srt/managers/schedule_policy.py +0 -3
- sglang/srt/managers/scheduler.py +113 -88
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +133 -34
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/memory_pool.py +2 -0
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +19 -14
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +23 -20
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +5 -6
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +30 -4
- sglang/srt/openai_api/protocol.py +0 -8
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +34 -4
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +6 -5
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +89 -14
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -175,6 +175,32 @@ def guess_chat_template_name_from_model_path(model_path):
     )
 
 
+def _validate_prompt(prompt: str):
+    """Validate that the prompt is not empty or whitespace only."""
+    is_invalid = False
+
+    # Check for empty/whitespace string
+    if isinstance(prompt, str):
+        is_invalid = not prompt.strip()
+    # Check for various invalid list cases: [], [""], [" "], [[]]
+    elif isinstance(prompt, list):
+        is_invalid = not prompt or (
+            len(prompt) == 1
+            and (
+                (isinstance(prompt[0], str) and not prompt[0].strip())
+                or (isinstance(prompt[0], list) and not prompt[0])
+            )
+        )
+
+    if is_invalid:
+        raise HTTPException(
+            status_code=400,
+            detail="Input cannot be empty or contain only whitespace.",
+        )
+
+    return prompt
+
+
 async def v1_files_create(
     file: UploadFile, purpose: str, file_storage_path: str = None
 ):
@@ -529,7 +555,6 @@ def v1_generate_request(
         "temperature": request.temperature,
         "max_new_tokens": request.max_tokens,
         "min_new_tokens": request.min_tokens,
-        "thinking_budget": request.thinking_budget,
         "stop": request.stop,
         "stop_token_ids": request.stop_token_ids,
         "top_p": request.top_p,
@@ -591,7 +616,7 @@ def v1_generate_response(
     echo = False
 
     if (not isinstance(request, list)) and request.echo:
-        # TODO: handle the case
+        # TODO: handle the case prompt is token ids
         if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
             # for the case of multiple str prompts
             prompts = request.prompt
@@ -647,7 +672,7 @@ def v1_generate_response(
         finish_reason = ret_item["meta_info"]["finish_reason"]
 
         if to_file:
-            # to make the
+            # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "text": text,
@@ -1102,7 +1127,6 @@ def v1_chat_generate_request(
         "temperature": request.temperature,
         "max_new_tokens": request.max_tokens or request.max_completion_tokens,
         "min_new_tokens": request.min_tokens,
-        "thinking_budget": request.thinking_budget,
         "stop": stop,
         "stop_token_ids": request.stop_token_ids,
         "top_p": request.top_p,
@@ -1755,6 +1779,8 @@ def v1_embedding_request(all_requests, tokenizer_manager):
 
     for request in all_requests:
         prompt = request.input
+        # Check for empty/whitespace string
+        prompt = _validate_prompt(request.input)
         assert (
             type(prompt) is first_prompt_type
         ), "All prompts must be of the same type in file input settings"
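The `_validate_prompt` helper added above rejects empty or whitespace-only inputs with an HTTP 400 before they reach the engine, and the last hunk wires it into `v1_embedding_request`. A minimal sketch of the resulting behavior against the OpenAI-compatible embeddings endpoint; host, port, and model name below are placeholders, not part of this release:

    import requests

    # A whitespace-only input should now be rejected with status 400 and the
    # detail "Input cannot be empty or contain only whitespace."
    resp = requests.post(
        "http://127.0.0.1:30000/v1/embeddings",
        json={"model": "my-embedding-model", "input": "   "},
    )
    print(resp.status_code, resp.json())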
sglang/srt/openai_api/protocol.py
CHANGED
@@ -172,7 +172,6 @@ class CompletionRequest(BaseModel):
     top_k: int = -1
     min_p: float = 0.0
     min_tokens: int = 0
-    thinking_budget: Optional[int] = None
     json_schema: Optional[str] = None
     regex: Optional[str] = None
     ebnf: Optional[str] = None
@@ -351,13 +350,6 @@ class ChatCompletionRequest(BaseModel):
         description="The maximum number of completion tokens for a chat completion request, "
         "including visible output tokens and reasoning tokens. Input tokens are not included. ",
     )
-    thinking_budget: Optional[int] = Field(
-        default=None,
-        description="The maximum number of reasoning tokens that can be generated for a request. "
-        "This setting of does not affect the thinking process of models. "
-        "If the number of tokens generated by the model's thinking process exceeds thinking_budget, "
-        "the reasoning content will be truncated and the final response content will be generated immediately.",
-    )
     n: int = 1
     presence_penalty: float = 0.0
     response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
sglang/srt/reasoning_parser.py
CHANGED
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        text = text.replace(self.think_start_token, "")
+        text = text.replace(self.think_start_token, "").strip()
         if self.think_end_token not in text:
             # Assume reasoning was truncated before `</think>` token
             return StreamingParseResult(reasoning_text=text)
@@ -73,7 +73,7 @@ class BaseReasoningFormatDetector:
             normal_text = current_text[end_idx + len(self.think_end_token) :]
 
             return StreamingParseResult(
-                normal_text=normal_text, reasoning_text=reasoning_text
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
             )
 
         # Continue with reasoning content
@@ -147,7 +147,7 @@ class ReasoningParser:
 
     Args:
         model_type (str): Type of model to parse reasoning from
-        stream_reasoning (bool): If
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
            If True, streams reasoning content as it arrives.
    """
sglang/srt/sampling/custom_logit_processor.py
CHANGED
@@ -28,11 +28,26 @@ class CustomLogitProcessor(ABC):
         """Define the callable behavior."""
         raise NotImplementedError
 
-
+    @classmethod
+    def to_str(cls) -> str:
         """Serialize the callable function to a JSON-compatible string."""
-        return json.dumps({"callable": dill.dumps(
+        return json.dumps({"callable": dill.dumps(cls).hex()})
 
     @classmethod
     def from_str(cls, json_str: str):
         """Deserialize a callable function from a JSON string."""
-        return _cache_from_str(json_str)
+        return _cache_from_str(json_str)()
+
+
+class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        disallowed_token_ids = custom_param_list[0]["token_ids"]
+        assert all(
+            disallowed_token_ids == c["token_ids"] for c in custom_param_list
+        ), f"{custom_param_list=}"
+        logits[..., disallowed_token_ids] = -float("inf")
+        return logits
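`DisallowedTokensLogitsProcessor` is a concrete `CustomLogitProcessor`: it reads a `token_ids` list from each request's `custom_params` and masks those ids to `-inf` before sampling. With `to_str()` now serializing the class itself and `from_str` instantiating it on the scheduler side, a request can ship the processor as a string. A hedged usage sketch; apart from the names visible in the diff, the engine keyword arguments and return shape below are assumptions:

    import sglang as sgl
    from sglang.srt.sampling.custom_logit_processor import DisallowedTokensLogitsProcessor

    # enable_custom_logit_processor mirrors the flag checked in SamplingBatchInfo.
    llm = sgl.Engine(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        enable_custom_logit_processor=True,
    )
    out = llm.generate(
        "The capital of France is",
        sampling_params={
            "max_new_tokens": 16,
            "custom_params": {"token_ids": [128001]},  # example ids to ban
        },
        custom_logit_processor=DisallowedTokensLogitsProcessor().to_str(),
    )
    print(out["text"])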
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -30,13 +30,8 @@ class SamplingBatchInfo:
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
-    # Use thinking_budget to truncate thinking
-    num_thinking_tokens: Optional[torch.Tensor] = None
-    think_end_ids: Optional[torch.Tensor] = None
-    thinking_budgets: Optional[torch.Tensor] = None
-
     # Masking tensors for grammar-guided structured outputs
-    vocab_size: int
+    vocab_size: int
     grammars: Optional[List] = None
     vocab_mask: Optional[torch.Tensor] = None
     apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -81,22 +76,7 @@ class SamplingBatchInfo:
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
-
-        think_end_ids = torch.tensor(
-            [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
-            dtype=torch.int64,
-        ).to(device, non_blocking=True)
-        num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
-            device, non_blocking=True
-        )
-        thinking_budgets = torch.tensor(
-            [r.sampling_params.thinking_budget or -1 for r in reqs],
-            dtype=torch.int64,
-        ).to(device, non_blocking=True)
-        else:
-            think_end_ids = None
-            num_thinking_tokens = None
-            thinking_budgets = None
+
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -152,9 +132,6 @@ class SamplingBatchInfo:
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
-            think_end_ids=think_end_ids,
-            num_thinking_tokens=num_thinking_tokens,
-            thinking_budgets=thinking_budgets,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
@@ -169,35 +146,6 @@ class SamplingBatchInfo:
     def __len__(self):
         return len(self.temperatures)
 
-    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
-        has_budget = self.thinking_budgets > 0
-        if not has_budget.any():
-            return
-        torch.where(
-            has_budget,
-            self.num_thinking_tokens + 1,
-            self.num_thinking_tokens,
-            out=self.num_thinking_tokens,
-        )
-        should_stop = has_budget & (
-            self.num_thinking_tokens - 1 > self.thinking_budgets
-        )
-        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
-        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
-        if len(batch_indices) > 0:
-            end_token_indices = self.think_end_ids[batch_indices]
-            next_token_logits[batch_indices, end_token_indices] = 0.0
-
-    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
-        if not torch.any(self.thinking_budgets > 0):
-            return
-        torch.where(
-            next_token_ids == self.think_end_ids,
-            torch.tensor(-1, device=self.thinking_budgets.device),
-            self.thinking_budgets,
-            out=self.thinking_budgets,
-        )
-
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
@@ -346,7 +294,7 @@ class SamplingBatchInfo:
         # Set the flag to True if any of the two has custom logit processor
         self.has_custom_logit_processor = True
 
-        # Note:
+        # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
         for item in [
@@ -359,5 +307,5 @@ class SamplingBatchInfo:
             other_val = getattr(other, item, None)
             setattr(self, item, torch.cat([self_val, other_val]))
 
-        self.is_all_greedy
+        self.is_all_greedy &= other.is_all_greedy
         self.need_min_p_sampling |= other.need_min_p_sampling
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -30,7 +30,6 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
-        thinking_budget: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stop_token_ids: Optional[List[int]] = None,
        temperature: float = 1.0,
@@ -51,6 +50,7 @@ class SamplingParams:
        spaces_between_special_tokens: bool = True,
        no_stop_trim: bool = False,
        custom_params: Optional[Dict[str, Any]] = None,
+        stream_interval: Optional[int] = None,
     ) -> None:
         self.max_new_tokens = max_new_tokens
         self.stop_strs = stop
@@ -58,7 +58,6 @@ class SamplingParams:
             self.stop_token_ids = set(stop_token_ids)
         else:
             self.stop_token_ids = None
-        self.thinking_budget = thinking_budget
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
@@ -77,6 +76,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.no_stop_trim = no_stop_trim
         self.custom_params = custom_params
+        self.stream_interval = stream_interval
 
         # Process some special cases
         if 0 <= self.temperature < _SAMPLING_EPS:
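`SamplingParams` now carries an optional per-request `stream_interval`, which can override the server-wide streaming interval. A hedged sketch against the native `/generate` endpoint; host, port, and the payload values are placeholders:

    import requests

    resp = requests.post(
        "http://127.0.0.1:30000/generate",
        json={
            "text": "Write a haiku about GPUs.",
            "sampling_params": {
                "max_new_tokens": 64,
                "stream_interval": 4,  # assumed: flush streamed chunks every 4 decoded tokens
            },
            "stream": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if line:
            print(line.decode())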
sglang/srt/server_args.py
CHANGED
@@ -98,6 +98,7 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     decode_log_interval: int = 40
+    enable_request_time_stats_logging: bool = False
 
     # API related
     api_key: Optional[str] = None
@@ -159,6 +160,7 @@ class ServerArgs:
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
+    enable_dp_lm_head: bool = False
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
@@ -305,6 +307,12 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Overlap scheduler is disabled because of using pipeline parallelism."
+            )
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -317,6 +325,11 @@ class ServerArgs:
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
 
+        if self.enable_dp_lm_head:
+            assert (
+                self.enable_dp_attention
+            ), "Please enable dp attention when setting enable_dp_attention. "
+
         # DeepEP MoE
         self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
@@ -335,6 +348,12 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -767,6 +786,12 @@ class ServerArgs:
             default=ServerArgs.decode_log_interval,
             help="The log interval of decode batch.",
         )
+        parser.add_argument(
+            "--enable-request-time-stats-logging",
+            action="store_true",
+            default=ServerArgs.enable_request_time_stats_logging,
+            help="Enable per request time stats logging",
+        )
 
         # API related
         parser.add_argument(
@@ -825,7 +850,7 @@ class ServerArgs:
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
-            "--nccl-init-addr",  # For backward
+            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
             type=str,
             help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
         )
@@ -1049,6 +1074,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
         )
+        parser.add_argument(
+            "--enable-dp-lm-head",
+            action="store_true",
+            help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
+        )
         parser.add_argument(
             "--enable-ep-moe",
             action="store_true",
@@ -1069,7 +1099,7 @@ class ServerArgs:
             "--cuda-graph-max-bs",
             type=int,
             default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph.",
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
         )
         parser.add_argument(
             "--cuda-graph-bs",
@@ -1096,7 +1126,7 @@ class ServerArgs:
         parser.add_argument(
             "--triton-attention-reduce-in-fp32",
             action="store_true",
-            help="Cast the
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels.",
         )
         parser.add_argument(
@@ -1188,7 +1218,7 @@ class ServerArgs:
            type=int,
            default=0,
            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-           "set it to tp_size can get best optimized
+           "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
        )
        parser.add_argument(
            "--disable-chunked-prefix-cache",
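Two new server options show up in this section: `--enable-request-time-stats-logging` and `--enable-dp-lm-head` (the latter only makes sense together with `--enable-dp-attention`). A hedged sketch using the offline engine, whose keyword arguments mirror the `ServerArgs` fields; the model path and parallel sizes below are placeholders:

    import sglang as sgl

    llm = sgl.Engine(
        model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
        tp_size=8,
        dp_size=8,
        enable_dp_attention=True,
        enable_dp_lm_head=True,                     # new in 0.4.6.post4
        enable_request_time_stats_logging=True,     # new in 0.4.6.post4
    )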
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
             self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture
+                f"Capture CUDA graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
 
         # Run and capture
         def run_once():
-            # Backup two
+            # Backup two fields, which will be modified in-place in `draft_forward`.
             output_cache_loc_backup = forward_batch.out_cache_loc
             hidden_states_backup = forward_batch.spec_info.hidden_states
 
sglang/srt/speculative/eagle_utils.py
CHANGED
@@ -167,12 +167,12 @@ class EagleVerifyOutput:
     draft_input: EagleDraftInput
     # Logit outputs from target worker
     logits_output: LogitsProcessorOutput
-    #
+    # Accepted token ids including the bonus token
     verified_id: torch.Tensor
-    #
+    # Accepted token length per sequence in a batch in CPU.
     accept_length_per_req_cpu: List[int]
-    #
-
+    # Accepted indices from logits_output.next_token_logits
+    accepted_indices: torch.Tensor
 
 
 @dataclass
@@ -316,7 +316,7 @@ class EagleVerifyInput:
 
         This API updates values inside logits_output based on the accepted
         tokens. I.e., logits_output.next_token_logits only contains
-
+        accepted token logits.
         """
         bs = self.retrive_index.shape[0]
         candidates = self.draft_token.reshape(bs, self.draft_token_num)
@@ -493,7 +493,7 @@ class EagleVerifyInput:
                 logits_output=logits_output,
                 verified_id=verified_id,
                 accept_length_per_req_cpu=accept_length_cpu,
-
+                accepted_indices=accept_index,
             )
         else:
             assign_req_to_token_pool[(bs,)](
@@ -539,7 +539,7 @@ class EagleVerifyInput:
                 logits_output=logits_output,
                 verified_id=verified_id,
                 accept_length_per_req_cpu=accept_length_cpu,
-
+                accepted_indices=accept_index,
             )
 
 
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
             self.has_prefill_wrapper_verify = False
         else:
             raise ValueError(
-                f"EAGLE is not
+                f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
             )
 
         self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -245,14 +245,14 @@ class EAGLEWorker(TpModelWorker):
         Args:
             batch: The batch to run forward. The state of the batch is modified as it runs.
         Returns:
-            A tuple of the final logit output of the target model, next tokens
-            the batch id (used for overlap schedule), and number of
+            A tuple of the final logit output of the target model, next tokens accepted,
+            the batch id (used for overlap schedule), and number of accepted tokens.
         """
         if batch.forward_mode.is_decode():
             with self.draft_tp_context(self.draft_model_runner.tp_group):
                 spec_info = self.draft(batch)
-            logits_output, verify_output, model_worker_batch =
-                batch, spec_info
+            logits_output, verify_output, model_worker_batch, can_run_cuda_graph = (
+                self.verify(batch, spec_info)
             )
 
             # If it is None, it means all requests are finished
@@ -264,21 +264,22 @@ class EAGLEWorker(TpModelWorker):
                 verify_output.verified_id,
                 model_worker_batch.bid,
                 sum(verify_output.accept_length_per_req_cpu),
+                can_run_cuda_graph,
             )
         elif batch.forward_mode.is_idle():
             model_worker_batch = batch.get_model_worker_batch()
-            logits_output, next_token_ids =
-                model_worker_batch
+            logits_output, next_token_ids, _ = (
+                self.target_worker.forward_batch_generation(model_worker_batch)
             )
 
-            return logits_output, next_token_ids, model_worker_batch.bid, 0
+            return logits_output, next_token_ids, model_worker_batch.bid, 0, False
         else:
             logits_output, next_token_ids, bid = self.forward_target_extend(batch)
             with self.draft_tp_context(self.draft_model_runner.tp_group):
                 self.forward_draft_extend(
                     batch, logits_output.hidden_states, next_token_ids
                 )
-            return logits_output, next_token_ids, bid, 0
+            return logits_output, next_token_ids, bid, 0, False
 
     def forward_target_extend(
         self, batch: ScheduleBatch
@@ -297,7 +298,7 @@ class EAGLEWorker(TpModelWorker):
         # We need the full hidden states to prefill the KV cache of the draft model.
         model_worker_batch = batch.get_model_worker_batch()
         model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
-        logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+        logits_output, next_token_ids, _ = self.target_worker.forward_batch_generation(
             model_worker_batch
         )
         return logits_output, next_token_ids, model_worker_batch.bid
@@ -478,8 +479,10 @@ class EAGLEWorker(TpModelWorker):
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch()
-        logits_output, _ =
-
+        logits_output, _, can_run_cuda_graph = (
+            self.target_worker.forward_batch_generation(
+                model_worker_batch, skip_sample=True
+            )
         )
         self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
@@ -491,11 +494,11 @@ class EAGLEWorker(TpModelWorker):
         )
 
         # Post process based on verified outputs.
-        # Pick indices that we care (
+        # Pick indices that we care (accepted)
         logits_output.next_token_logits = logits_output.next_token_logits[
-            res.
+            res.accepted_indices
         ]
-        logits_output.hidden_states = logits_output.hidden_states[res.
+        logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
 
         # Prepare the batch for the next draft forwards.
         batch.forward_mode = ForwardMode.DECODE
@@ -504,7 +507,7 @@ class EAGLEWorker(TpModelWorker):
         if batch.return_logprob:
             self.add_logprob_values(batch, res, logits_output)
 
-        return logits_output, res, model_worker_batch
+        return logits_output, res, model_worker_batch, can_run_cuda_graph
 
     def add_logprob_values(
         self,
@@ -590,14 +593,14 @@ class EAGLEWorker(TpModelWorker):
             model_worker_batch, self.draft_model_runner
         )
         forward_batch.return_logprob = False
-        logits_output = self.draft_model_runner.forward(forward_batch)
+        logits_output, _ = self.draft_model_runner.forward(forward_batch)
         self._detect_nan_if_needed(logits_output)
         assert isinstance(forward_batch.spec_info, EagleDraftInput)
         assert forward_batch.spec_info is batch.spec_info
         self.capture_for_decode(logits_output, forward_batch.spec_info)
 
     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
-        # Backup
+        # Backup fields that will be modified in-place
         seq_lens_backup = batch.seq_lens.clone()
         req_pool_indices_backup = batch.req_pool_indices
         accept_length_backup = batch.spec_info.accept_length
@@ -617,7 +620,7 @@ class EAGLEWorker(TpModelWorker):
         )
 
         # Run
-        logits_output = self.draft_model_runner.forward(forward_batch)
+        logits_output, _ = self.draft_model_runner.forward(forward_batch)
 
         self._detect_nan_if_needed(logits_output)
         self.capture_for_decode(logits_output, forward_batch.spec_info)
sglang/srt/utils.py
CHANGED
@@ -282,7 +282,9 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper
 
 
-def get_available_gpu_memory(
+def get_available_gpu_memory(
+    device, gpu_id, distributed=False, empty_cache=True, cpu_group=None
+):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
@@ -344,10 +346,10 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
         free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
 
     if distributed:
-        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32)
-
+        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32)
+        torch.distributed.all_reduce(
+            tensor, op=torch.distributed.ReduceOp.MIN, group=cpu_group
         )
-        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
         free_gpu_memory = tensor.item()
 
     return free_gpu_memory / (1 << 30)
@@ -2076,7 +2078,6 @@ def is_fa3_default_architecture(hf_config):
     "Llama4ForConditionalGeneration",
     "LlamaForCausalLM",
     "MistralForCausalLM",
-    "MixtralForCausalLM",
     "Gemma2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Qwen3ForCausalLM",
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -90,7 +90,7 @@ def run_eval(args):
     #####################################
 
     # Run requests
-    tic = time.
+    tic = time.perf_counter()
    states = few_shot_gsm8k.run_batch(
        arguments,
        temperature=args.temperature if hasattr(args, "temperature") else 0,
@@ -99,7 +99,7 @@ def run_eval(args):
        return_logprob=getattr(args, "return_logprob", None),
        logprob_start_len=getattr(args, "logprob_start_len", None),
    )
-    latency = time.
+    latency = time.perf_counter() - tic
 
     preds = []
     for i in range(len(states)):
sglang/test/few_shot_gsm8k_engine.py
CHANGED
@@ -89,7 +89,7 @@ def run_eval(args):
     }
 
     # Run requests
-    tic = time.
+    tic = time.perf_counter()
 
     loop = asyncio.get_event_loop()
 
@@ -98,7 +98,7 @@ def run_eval(args):
     )
 
     # End requests
-    latency = time.
+    latency = time.perf_counter() - tic
 
     # Shutdown the engine
     engine.shutdown()
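The timing change in the test scripts swaps wall-clock reads for `time.perf_counter()`, a monotonic, high-resolution clock that is unaffected by system clock adjustments and is the usual choice for latency measurement. The pattern in isolation:

    import time

    tic = time.perf_counter()
    time.sleep(0.1)  # stand-in for the benchmarked call (run_batch / the async loop in the tests)
    latency = time.perf_counter() - tic
    print(f"latency: {latency:.3f} s")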