sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +220 -378
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +143 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +681 -259
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +224 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +44 -18
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +94 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +208 -28
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +136 -52
  181. sglang/srt/speculative/build_eagle_tree.py +2 -8
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  183. sglang/srt/speculative/eagle_utils.py +92 -58
  184. sglang/srt/speculative/eagle_worker.py +186 -94
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -1,8 +1,10 @@
 import logging
+import os
 import time
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union

 import torch
+from huggingface_hub import snapshot_download

 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -20,11 +22,13 @@ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
 from sglang.srt.speculative.eagle_utils import (
     EagleDraftInput,
     EagleVerifyInput,
+    EagleVerifyOutput,
     assign_draft_cache_locs,
     fast_topk,
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import get_available_gpu_memory

 logger = logging.getLogger(__name__)

@@ -40,10 +44,31 @@ class EAGLEWorker(TpModelWorker):
         nccl_port: int,
         target_worker: TpModelWorker,
     ):
+        # Override context length with target model's context length
+        server_args.context_length = target_worker.model_runner.model_config.context_len
+        os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "1"
+
         # Do not capture cuda graph in `super().__init__()`
         # We will capture it later
         backup_disable_cuda_graph = server_args.disable_cuda_graph
         server_args.disable_cuda_graph = True
+
+        # Lossy optimization by using hot tokens
+        if server_args.speculative_token_map is not None:
+            self.hot_token_id = load_token_map(server_args.speculative_token_map)
+            server_args.json_model_override_args = (
+                f'{{"hot_vocab_size": {len(self.hot_token_id)}}}'
+            )
+        else:
+            self.hot_token_id = None
+
+        # We share the allocator with a target worker. Draft/target worker
+        # owns its own KV cache.
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            target_worker.get_memory_pool()
+        )
+
+        # Init target worker
         super().__init__(
             gpu_id=gpu_id,
             tp_rank=tp_rank,
@@ -51,9 +76,10 @@ class EAGLEWorker(TpModelWorker):
             nccl_port=nccl_port,
             dp_rank=dp_rank,
             is_draft_worker=True,
+            req_to_token_pool=self.req_to_token_pool,
+            token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
         )
         self.target_worker = target_worker
-        self.finish_extend_len = []

         # Parse arguments
         self.topk = server_args.speculative_eagle_topk
@@ -62,12 +88,20 @@ class EAGLEWorker(TpModelWorker):
             server_args.speculative_algorithm
         )
         self.server_args = server_args
+        self.use_nan_detection = self.server_args.enable_nan_detection
+        self.device = self.model_runner.device
+        self.gpu_id = self.model_runner.gpu_id

         # Share the embedding and lm_head
-        if not self.speculative_algorithm.is_nextn():
-            embed, head = self.target_worker.model_runner.model.get_embed_and_head()
-            self.model_runner.model.set_embed_and_head(embed, head)
-        self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+        if self.hot_token_id is not None:
+            head = head.clone()
+            self.hot_token_id = self.hot_token_id.to(head.device)
+            head.data = head.data[self.hot_token_id]
+        self.draft_model_runner.model.set_embed_and_head(embed, head)
+        self.draft_model_runner.server_args.disable_cuda_graph = (
+            backup_disable_cuda_graph
+        )

         # Create multi-step attn backends and cuda graph runners
         if server_args.attention_backend == "flashinfer":
@@ -95,7 +129,7 @@ class EAGLEWorker(TpModelWorker):
                 f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
             )

-        self.model_runner.draft_attn_backend = self.draft_attn_backend
+        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
         self.init_cuda_graphs()

     def init_cuda_graphs(self):
@@ -106,55 +140,81 @@
             return

         tic = time.time()
-        logger.info("Capture cuda graph begin. This can take up to several minutes.")
+        logger.info(
+            f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
-        logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s")
+        logger.info(
+            f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )

-    def forward_batch_speculative_generation(self, batch: ScheduleBatch):
+    @property
+    def draft_model_runner(self):
+        return self.model_runner
+
+    def forward_batch_speculative_generation(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, List[int], int, int]:
+        """Run speculative decoding forward.
+
+        NOTE: Many states of batch is modified as you go through. It is not guaranteed
+        the final output batch doesn't have the same state as the input.
+
+        Args:
+            batch: The batch to run forward. The state of the batch is modified as it runs.
+        Returns:
+            A tuple of the final logit output of the target model, next tokens accepeted,
+            the batch id (used for overlap schedule), and number of accepeted tokens.
+        """
+        assert not batch.spec_algorithm.is_none()
         if batch.forward_mode.is_decode():
-            # Draft
-            spec_info: EagleVerifyInput = self.draft(batch)
-
-            # Verify
-            (
-                next_draft_input,
-                logits_output,
-                verified_id,
-                self.finish_extend_len,
-                accept_length_cpu,
-                model_worker_batch,
-            ) = self.verify(batch, spec_info)
-            batch.spec_info = next_draft_input
-            # if it is None, means all requsets are finished
+            spec_info, to_free_cache_loc = self.draft(batch)
+            logits_output, verify_output, model_worker_batch = self.verify(
+                batch, spec_info
+            )
+            # Free cache loc (we put it here to avoid synchronization and hide kernel launch overhead.)
+            self.token_to_kv_pool_allocator.free(to_free_cache_loc)
+            # if it is None, means all requests are finished
             if batch.spec_info.verified_id is not None:
                 self.forward_draft_extend_after_decode(batch)
+
             return (
                 logits_output,
-                verified_id,
-                model_worker_batch,
-                sum(accept_length_cpu),
+                verify_output.verified_id,
+                model_worker_batch.bid,
+                sum(verify_output.accept_length_per_req_cpu),
            )

         else:
-            # Forward with the target model and get hidden states.
-            # We need the full hidden states to prefill the KV cache of the draft model.
-            model_worker_batch = batch.get_model_worker_batch()
-            model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
-            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
-                model_worker_batch
-            )
-
-            # Forward with the draft model.
-            batch.spec_info = EagleDraftInput(
-                hidden_states=logits_output.hidden_states,
-                verified_id=next_token_ids,
+            logits_output, next_token_ids, bid = self.forward_target_extend(batch)
+            self.forward_draft_extend(
+                batch, logits_output.hidden_states, next_token_ids
             )
-            self.forward_draft_extend(batch)
-            return logits_output, next_token_ids, model_worker_batch, 0
+            return logits_output, next_token_ids, bid, 0
+
+    def forward_target_extend(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, List[int], int]:
+        """Run the target extend.
+
+        Args:
+            batch: The batch to run. States could be modified.
+
+        Returns:
+            logits_output: The output of logits. It will contain the full hidden states.
+            next_token_ids: Next token ids generated.
+            bid: The model batch ID. Used for overlap schedule.
+        """
+        # Forward with the target model and get hidden states.
+        # We need the full hidden states to prefill the KV cache of the draft model.
+        model_worker_batch = batch.get_model_worker_batch()
+        model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+        logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+            model_worker_batch
+        )
+        return logits_output, next_token_ids, model_worker_batch.bid

     def draft(self, batch: ScheduleBatch):
-        self._set_mem_pool(batch, self.model_runner)
-
         # Parse args
         num_seqs = batch.batch_size()
         spec_info = batch.spec_info
@@ -172,7 +232,6 @@ class EAGLEWorker(TpModelWorker):
             self.topk,
             self.speculative_num_steps,
         )
-
         batch.out_cache_loc = out_cache_loc
         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
         spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
@@ -180,11 +239,12 @@
         # Get forward batch
         spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
         can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
             forward_batch
         )
-
         if can_cuda_graph:
             score_list, token_list, parents_list = self.cuda_graph_runner.replay(
                 forward_batch
@@ -192,7 +252,9 @@
         else:
             # Initialize attention backend
             self.draft_attn_backend.init_forward_metadata(forward_batch)
-
+            forward_batch = ForwardBatch.init_new(
+                model_worker_batch, self.draft_model_runner
+            )
             # Run forward steps
             score_list, token_list, parents_list = self.draft_forward(forward_batch)

@@ -209,10 +271,7 @@
             batch.sampling_info.is_all_greedy,
         )

-        # Free cache locations
-        batch.token_to_kv_pool.free(out_cache_loc)
-        self._set_mem_pool(batch, self.target_worker.model_runner)
-        return ret
+        return ret, out_cache_loc

     def draft_forward(self, forward_batch: ForwardBatch):
         # Parse args
@@ -223,6 +282,8 @@
             spec_info.topk_index,
             spec_info.hidden_states,
         )
+        if self.hot_token_id is not None:
+            topk_index = self.hot_token_id[topk_index]

         # Return values
         score_list: List[torch.Tensor] = []
@@ -260,8 +321,11 @@
             logits_output = self.model_runner.model.forward(
                 forward_batch.input_ids, forward_batch.positions, forward_batch
             )
+            self._detect_nan_if_needed(logits_output)
             probs = torch.softmax(logits_output.next_token_logits, dim=-1)
             topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+            if self.hot_token_id is not None:
+                topk_index = self.hot_token_id[topk_index]
             hidden_states = logits_output.hidden_states

         return score_list, token_list, parents_list
@@ -274,68 +338,96 @@
         logits_output, _ = self.target_worker.forward_batch_generation(
             model_worker_batch, skip_sample=True
         )
+        self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
-        res = spec_info.verify(batch, logits_output)
+        res: EagleVerifyOutput = spec_info.verify(
+            batch, logits_output, self.token_to_kv_pool_allocator
+        )
+
+        # Post process based on verified outputs.
+        # Pick indices that we care (accepeted)
+        logits_output.next_token_logits = logits_output.next_token_logits[
+            res.accepeted_indices_cpu
+        ]
+        logits_output.hidden_states = logits_output.hidden_states[
+            res.accepeted_indices_cpu
+        ]
+        # Prepare the batch for the next draft forwards.
         batch.forward_mode = ForwardMode.DECODE
-        return res + (model_worker_batch,)
+        batch.spec_info = res.draft_input
+
+        return logits_output, res, model_worker_batch

-    def forward_draft_extend(self, batch: ScheduleBatch):
-        self._set_mem_pool(batch, self.model_runner)
+    def forward_draft_extend(
+        self,
+        batch: ScheduleBatch,
+        hidden_states: torch.Tensor,
+        next_token_ids: List[int],
+    ):
+        """Run draft model extend. This API modifies the states of the batch.
+
+        Args:
+            batch: The batch to run.
+            hidden_states: Hidden states from the target model forward
+            next_token_ids: Next token ids generated from the target forward.
+        """
+        batch.spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            verified_id=next_token_ids,
+        )
         batch.spec_info.prepare_for_extend(batch)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-        logits_output = self.model_runner.forward(forward_batch)
-        self.capture_for_decode(logits_output, forward_batch)
-        self._set_mem_pool(batch, self.target_worker.model_runner)
-
-    def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
-        batch.token_to_kv_pool = runner.token_to_kv_pool
-        batch.req_to_token_pool = runner.req_to_token_pool
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        logits_output = self.draft_model_runner.forward(forward_batch)
+        self._detect_nan_if_needed(logits_output)
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+        assert forward_batch.spec_info is batch.spec_info
+        self.capture_for_decode(logits_output, forward_batch.spec_info)

     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
         seq_lens_backup = batch.seq_lens
-        req_pool_indices_backup = batch.req_pool_indices
-
-        self._set_mem_pool(batch, self.model_runner)
         batch.forward_mode = ForwardMode.DRAFT_EXTEND
         batch.spec_info.prepare_extend_after_decode(batch, self.speculative_num_steps)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        # We don't need logprob for this extend.
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-        logits_output = self.model_runner.forward(forward_batch)
-        self.capture_for_decode(logits_output, forward_batch)
-        self._set_mem_pool(batch, self.target_worker.model_runner)
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        logits_output = self.draft_model_runner.forward(forward_batch)
+        self._detect_nan_if_needed(logits_output)
+        assert forward_batch.spec_info is batch.spec_info
+        self.capture_for_decode(logits_output, forward_batch.spec_info)

         # Restore backup.
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
         batch.forward_mode = ForwardMode.DECODE
         batch.seq_lens = seq_lens_backup
-        batch.req_pool_indices = req_pool_indices_backup

     def capture_for_decode(
-        self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
+        self, logits_output: LogitsProcessorOutput, draft_input: EagleDraftInput
     ):
         probs = torch.softmax(logits_output.next_token_logits, dim=-1)
-        spec_info = forward_batch.spec_info
-        spec_info.topk_p, spec_info.topk_index = fast_topk(probs, self.topk, dim=-1)
-        spec_info.hidden_states = logits_output.hidden_states
-
-    # Don't support prefix share now.
-    def finish_request(self, reqs: Union[Req, List[Req]]):
-        if not isinstance(reqs, List):
-            reqs = [reqs]
-        for req in reqs:
-            if req.rid not in self.finish_extend_len:
-                continue
-            req_len = (
-                len(req.origin_input_ids)
-                + len(req.output_ids)
-                - self.finish_extend_len[req.rid]
-                - 1
-            )
-            kv_indices = self.model_runner.req_to_token_pool.req_to_token[
-                req.req_pool_idx
-            ][:req_len]
-            self.model_runner.token_to_kv_pool.free(kv_indices)
-            self.model_runner.req_to_token_pool.free(req.req_pool_idx)
+        draft_input.topk_p, draft_input.topk_index = fast_topk(probs, self.topk, dim=-1)
+        draft_input.hidden_states = logits_output.hidden_states
+
+    def _detect_nan_if_needed(self, logits_output: LogitsProcessorOutput):
+        if self.use_nan_detection:
+            logits = logits_output.next_token_logits
+            if torch.any(torch.isnan(logits)):
+                logger.warning("Detected errors during sampling! NaN in the logits.")
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
+
+
+def load_token_map(token_map_path: str) -> List[int]:
+    if not os.path.exists(token_map_path):
+        cache_dir = snapshot_download(
+            os.path.dirname(token_map_path),
+            ignore_patterns=["*.bin", "*.safetensors"],
+        )
+        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
+    hot_token_id = torch.load(token_map_path)
+    return torch.tensor(hot_token_id, dtype=torch.int32)
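The hot-token path added above is optional: when server_args.speculative_token_map is set, load_token_map resolves the path (downloading the containing Hugging Face repo when the file is not present locally), loads a list of token ids with torch.load, and the draft lm_head is sliced down to that reduced vocabulary. A minimal sketch of producing a compatible token-map file follows; the file name and the example ids are illustrative, and any Python list of ints saved with torch.save would work the same way.

    import torch

    # Illustrative only: ids of tokens the draft model should keep in its lm_head.
    hot_token_id = [1, 2, 13, 278, 29871, 29889]
    torch.save(hot_token_id, "hot_tokens.pt")  # hypothetical local path

    # Pointing server_args.speculative_token_map at this file makes EAGLEWorker
    # call load_token_map("hot_tokens.pt") and index the cloned head with it.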
sglang/srt/speculative/spec_info.py CHANGED
@@ -5,30 +5,18 @@ class SpeculativeAlgorithm(IntEnum):
     NONE = auto()
     EAGLE = auto()

-    # NEXTN spec decoding is for DeepSeek V3/R1
-    # currently it's implemented based on EAGLE
-    NEXTN = auto()
-
     def is_none(self):
         return self == SpeculativeAlgorithm.NONE

     def is_eagle(self):
-        return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.NEXTN
-
-    def is_nextn(self):
-        return self == SpeculativeAlgorithm.NEXTN
+        return self == SpeculativeAlgorithm.EAGLE

     @staticmethod
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
-            "NEXTN": SpeculativeAlgorithm.NEXTN,
             None: SpeculativeAlgorithm.NONE,
         }
         if name is not None:
             name = name.upper()
         return name_map[name]
-
-
-class SpecInfo:
-    pass
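With NEXTN removed from the enum, the lookup in from_string only recognizes "EAGLE" (case-insensitive) and None. A small illustrative check of the resulting behavior, assuming the import path shown in the diff above:

    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

    algo = SpeculativeAlgorithm.from_string("eagle")  # upper-cased to "EAGLE"
    assert algo.is_eagle() and not algo.is_none()

    none = SpeculativeAlgorithm.from_string(None)  # maps to SpeculativeAlgorithm.NONE
    assert none.is_none()

    # "NEXTN" is no longer a key in name_map, so from_string("nextn") now raises KeyError.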
sglang/srt/utils.py CHANGED
@@ -32,13 +32,15 @@ import socket
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import warnings
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
+from multiprocessing import Pool
 from multiprocessing.reduction import ForkingPickler
-from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Protocol, Set, Tuple, Union

 import numpy as np
 import psutil
@@ -311,7 +313,7 @@ def make_layers(
     """Make a list of layers with the given layer function"""
     modules = torch.nn.ModuleList(
         [
-            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=f"{prefix}.{idx}"))
+            maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
             for idx in range(num_hidden_layers)
         ]
     )
@@ -480,6 +482,10 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):

 def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
     """Kill the process and all its child processes."""
+    # Remove sigchld handler to avoid spammy logs.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
     if parent_pid is None:
         parent_pid = os.getpid()
         include_parent = False
@@ -735,13 +741,6 @@ def pytorch_profile(name, func, *args, data_size=-1):
     return result


-def first_rank_print(*args, **kwargs):
-    if torch.cuda.current_device() == 0:
-        print(*args, **kwargs)
-    else:
-        pass
-
-
 def get_zmq_socket(
     context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
 ):
@@ -1154,9 +1153,9 @@ def set_gpu_proc_affinity(

     if psutil.cpu_count() != psutil.cpu_count(logical=False):
         # HT on
-        upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
-        lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
-        bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids))
+        lower_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+        upper_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
+        bind_cpu_ids = list(itertools.chain(lower_cpu_ids, upper_cpu_ids))
     else:
         # HT off
         bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
@@ -1171,6 +1170,11 @@ def get_bool_env_var(name: str, default: str = "false") -> bool:
     return value.lower() in ("true", "1")


+@lru_cache(maxsize=2)
+def disable_request_logging() -> bool:
+    return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
+
+
 @lru_cache(maxsize=8)
 def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
     # Note: cuda_visible_devices is not used, but we keep it as an argument for
@@ -1212,7 +1216,11 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))


-def dataclass_to_string_truncated(data, max_length=2048):
+def dataclass_to_string_truncated(
+    data, max_length=2048, skip_names: Optional[Set[str]] = None
+):
+    if skip_names is None:
+        skip_names = set()
     if isinstance(data, str):
         if len(data) > max_length:
             half_length = max_length // 2
@@ -1231,6 +1239,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
             + ", ".join(
                 f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
                 for k, v in data.items()
+                if k not in skip_names
             )
             + "}"
         )
@@ -1241,6 +1250,7 @@ def dataclass_to_string_truncated(data, max_length=2048):
             + ", ".join(
                 f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
                 for f in fields
+                if f.name not in skip_names
             )
             + ")"
         )
@@ -1289,7 +1299,7 @@ def debug_timing(func):
             tic.record()
             result = func(*args, **kwargs)
             toc.record()
-            torch.cuda.synchronize()  # Ensure all CUDA operations are complete
+            toc.synchronize()  # Wait for the function to complete without synchronizing all ops on the GPU
             elapsed = tic.elapsed_time(toc)
             indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
             num_tokens = len(indices) if indices is not None else 0
@@ -1319,9 +1329,9 @@ def pyspy_dump_schedulers():
         result = subprocess.run(
             cmd, shell=True, capture_output=True, text=True, check=True
         )
-        logger.info(f"Profile for PID {pid}:\n{result.stdout}")
+        logger.error(f"Pyspy dump for PID {pid}:\n{result.stdout}")
     except subprocess.CalledProcessError as e:
-        logger.info(f"Failed to profile PID {pid}. Error: {e.stderr}")
+        logger.error(f"Pyspy failed to dump PID {pid}. Error: {e.stderr}")


 def kill_itself_when_parent_died():
@@ -1383,7 +1393,6 @@ def get_ip() -> str:


 def get_open_port() -> int:
-
     port = os.getenv("SGLANG_PORT")
     if port is not None:
         while True:
@@ -1446,8 +1455,25 @@ def launch_dummy_health_check_server(host, port):
     )


+def create_checksum(directory: str):
+    raise NotImplementedError()
+
+
 def set_cuda_arch():
     if is_flashinfer_available():
         capability = torch.cuda.get_device_capability()
         arch = f"{capability[0]}.{capability[1]}"
         os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
+
+
+def add_prefix(name: str, prefix: str) -> str:
+    """Add a weight path prefix to a module name.
+
+    Args:
+        name: base module name.
+        prefix: weight prefix str to added to the front of `name` concatenated with `.`.
+
+    Returns:
+        The string `prefix.name` if prefix is non-empty, otherwise just `name`.
+    """
+    return name if not prefix else f"{prefix}.{name}"
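The add_prefix helper added at the end of utils.py replaces the inline f"{prefix}.{idx}" pattern that make_layers used before. A couple of illustrative calls; the module path strings here are made up for the example:

    from sglang.srt.utils import add_prefix

    add_prefix("0", "model.layers")  # -> "model.layers.0"
    add_prefix("lm_head", "")        # -> "lm_head" (no leading dot when the prefix is empty)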
sglang/srt/warmup.py ADDED
@@ -0,0 +1,47 @@
+import logging
+from typing import List
+
+import numpy as np
+import tqdm
+
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__file__)
+
+_warmup_registry = {}
+
+
+def warmup(name: str) -> callable:
+    def decorator(fn: callable):
+        _warmup_registry[name] = fn
+        return fn
+
+    return decorator
+
+
+async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
+    for warmup_name in warmup_names:
+        if warmup_name not in _warmup_registry:
+            logger.warning(f"Could not find custom warmup {warmup_name}")
+            continue
+        logger.info(f"Running warmup {warmup_name}")
+        await _warmup_registry[warmup_name](tokenizer_manager)
+
+
+@warmup("voice_chat")
+async def voice_chat(tokenizer_manager: TokenizerManager):
+    # this warms up the fused_moe triton kernels and caches them
+    # if we don't do this we break real time inference for voice chat
+    for i in tqdm.trange(1, 512):
+        size = i * 4
+        generate_req_input = GenerateReqInput(
+            input_ids=(np.random.randint(2**16, size=[size])).tolist(),
+            sampling_params={
+                "max_new_tokens": 30,
+                "temperature": 0.8,
+                "stop_token_ids": [1],
+                "min_p": 0.0,
+            },
+        )
+        await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
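The registry in the new warmup.py lets startup code run named warmup coroutines against the TokenizerManager through execute_warmups. A hedged sketch of registering a custom warmup; the name "short_prompt" and the request body are illustrative, while the decorator and the generate_request call pattern come from the file above:

    from sglang.srt.managers.io_struct import GenerateReqInput
    from sglang.srt.warmup import warmup


    @warmup("short_prompt")
    async def short_prompt(tokenizer_manager):
        # One tiny generation to trigger kernel compilation and caching.
        req = GenerateReqInput(
            text="Hello",
            sampling_params={"max_new_tokens": 8, "temperature": 0.0},
        )
        await tokenizer_manager.generate_request(req, None).__anext__()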
sglang/test/few_shot_gsm8k.py CHANGED
@@ -93,9 +93,11 @@ def run_eval(args):
     tic = time.time()
     states = few_shot_gsm8k.run_batch(
         arguments,
-        temperature=0,
+        temperature=args.temperature if hasattr(args, "temperature") else 0,
         num_threads=args.parallel,
         progress_bar=True,
+        return_logprob=getattr(args, "return_logprob", None),
+        logprob_start_len=getattr(args, "logprob_start_len", None),
     )
     latency = time.time() - tic

@@ -141,5 +143,6 @@ if __name__ == "__main__":
     parser.add_argument("--parallel", type=int, default=128)
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=30000)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()
     run_eval(args)