sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
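The same comparison can be reproduced locally from the two wheels. Below is a minimal sketch, assuming `pip` is on PATH with network access to PyPI; `fetch_wheel` and `read_member` are illustrative helpers written for this page, not part of sglang:

```python
# Sketch: diff one module between two published wheels of the same package.
import difflib
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path


def fetch_wheel(spec: str, dest: Path) -> Path:
    """Download a single wheel (no deps) into `dest` and return its path."""
    subprocess.run(
        [sys.executable, "-m", "pip", "download", spec,
         "--no-deps", "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob("*.whl"))


def read_member(wheel: Path, name: str) -> list:
    """Read one file out of a wheel as a list of lines (newlines kept)."""
    with zipfile.ZipFile(wheel) as zf:
        return zf.read(name).decode("utf-8").splitlines(keepends=True)


with tempfile.TemporaryDirectory() as tmp:
    old_dir, new_dir = Path(tmp) / "old", Path(tmp) / "new"
    old_dir.mkdir()
    new_dir.mkdir()
    old = fetch_wheel("sglang==0.4.3.post2", old_dir)
    new = fetch_wheel("sglang==0.4.3.post4", new_dir)
    member = "sglang/srt/speculative/eagle_worker.py"
    sys.stdout.writelines(
        difflib.unified_diff(
            read_member(old, member),
            read_member(new, member),
            fromfile=f"0.4.3.post2/{member}",
            tofile=f"0.4.3.post4/{member}",
        )
    )
```

Running it against `sglang/srt/speculative/eagle_worker.py` yields a diff like the first one shown below.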
sglang/srt/speculative/eagle_worker.py

```diff
@@ -1,18 +1,19 @@
 import logging
+import os
 import time
-from typing import List, Optional,
+from typing import List, Optional, Tuple
 
 import torch
+from huggingface_hub import snapshot_download
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.schedule_batch import
+from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
     ForwardBatch,
     ForwardMode,
 )
-from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
     EAGLEDraftCudaGraphRunner,
@@ -20,11 +21,12 @@ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
 from sglang.srt.speculative.eagle_utils import (
     EagleDraftInput,
     EagleVerifyInput,
+    EagleVerifyOutput,
     assign_draft_cache_locs,
     fast_topk,
     select_top_k_tokens,
 )
-from sglang.srt.
+from sglang.srt.utils import get_available_gpu_memory
 
 logger = logging.getLogger(__name__)
 
@@ -40,10 +42,39 @@ class EAGLEWorker(TpModelWorker):
         nccl_port: int,
         target_worker: TpModelWorker,
     ):
+        # Parse arguments
+        self.server_args = server_args
+        self.topk = server_args.speculative_eagle_topk
+        self.speculative_num_steps = server_args.speculative_num_steps
+        self.padded_static_len = self.speculative_num_steps + 1
+        self.enable_nan_detection = server_args.enable_nan_detection
+        self.gpu_id = gpu_id
+        self.device = server_args.device
+        self.target_worker = target_worker
+
+        # Override context length with target model's context length
+        server_args.context_length = target_worker.model_runner.model_config.context_len
+
         # Do not capture cuda graph in `super().__init__()`
-        #
+        # It will be captured later.
         backup_disable_cuda_graph = server_args.disable_cuda_graph
         server_args.disable_cuda_graph = True
+        # Share the allocator with a target worker.
+        # Draft and target worker own their own KV cache pools.
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            target_worker.get_memory_pool()
+        )
+
+        # Load hot token ids
+        if server_args.speculative_token_map is not None:
+            self.hot_token_id = load_token_map(server_args.speculative_token_map)
+            server_args.json_model_override_args = (
+                f'{{"hot_vocab_size": {len(self.hot_token_id)}}}'
+            )
+        else:
+            self.hot_token_id = None
+
+        # Init draft worker
         super().__init__(
             gpu_id=gpu_id,
             tp_rank=tp_rank,
@@ -51,26 +82,27 @@ class EAGLEWorker(TpModelWorker):
             nccl_port=nccl_port,
             dp_rank=dp_rank,
             is_draft_worker=True,
+            req_to_token_pool=self.req_to_token_pool,
+            token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
         )
-        self.target_worker = target_worker
-        self.finish_extend_len = []
 
-        #
-
-        self.
-
-
+        # Share the embedding and lm_head
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+        if self.hot_token_id is not None:
+            head = head.clone()
+            self.hot_token_id = self.hot_token_id.to(head.device)
+            head.data = head.data[self.hot_token_id]
+        self.draft_model_runner.model.set_embed_and_head(embed, head)
+        self.draft_model_runner.server_args.disable_cuda_graph = (
+            backup_disable_cuda_graph
         )
-        self.server_args = server_args
 
-
-
-        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
-        self.model_runner.model.set_embed_and_head(embed, head)
-        self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+        self.init_attention_backend()
+        self.init_cuda_graphs()
 
+    def init_attention_backend(self):
         # Create multi-step attn backends and cuda graph runners
-        if server_args.attention_backend == "flashinfer":
+        if self.server_args.attention_backend == "flashinfer":
             from sglang.srt.layers.attention.flashinfer_backend import (
                 FlashInferMultiStepDraftBackend,
             )
@@ -80,7 +112,7 @@ class EAGLEWorker(TpModelWorker):
                 self.topk,
                 self.speculative_num_steps,
             )
-        elif server_args.attention_backend == "triton":
+        elif self.server_args.attention_backend == "triton":
             from sglang.srt.layers.attention.triton_backend import (
                 TritonMultiStepDraftBackend,
             )
@@ -92,11 +124,9 @@ class EAGLEWorker(TpModelWorker):
             )
         else:
             raise ValueError(
-                f"EAGLE is not supportted in attention backend {server_args.attention_backend}"
+                f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
             )
-
-        self.model_runner.draft_attn_backend = self.draft_attn_backend
-        self.init_cuda_graphs()
+        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
 
     def init_cuda_graphs(self):
         """Capture cuda graphs."""
@@ -106,55 +136,81 @@ class EAGLEWorker(TpModelWorker):
             return
 
         tic = time.time()
-        logger.info(
+        logger.info(
+            f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
-        logger.info(
+        logger.info(
+            f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
 
-
+    @property
+    def draft_model_runner(self):
+        return self.model_runner
+
+    def forward_batch_speculative_generation(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, List[int], int, int]:
+        """Run speculative decoding forward.
+
+        NOTE: Many states of batch is modified as you go through. It is not guaranteed
+        the final output batch doesn't have the same state as the input.
+
+        Args:
+            batch: The batch to run forward. The state of the batch is modified as it runs.
+        Returns:
+            A tuple of the final logit output of the target model, next tokens accepeted,
+            the batch id (used for overlap schedule), and number of accepeted tokens.
+        """
+        assert not batch.spec_algorithm.is_none()
         if batch.forward_mode.is_decode():
-
-
-
-
-            (
-
-
-                verified_id,
-                self.finish_extend_len,
-                accept_length_cpu,
-                model_worker_batch,
-            ) = self.verify(batch, spec_info)
-            batch.spec_info = next_draft_input
-            # if it is None, means all requsets are finished
+            spec_info, to_free_cache_loc = self.draft(batch)
+            logits_output, verify_output, model_worker_batch = self.verify(
+                batch, spec_info
+            )
+            # Free cache loc (we put it here to avoid synchronization and hide kernel launch overhead.)
+            self.token_to_kv_pool_allocator.free(to_free_cache_loc)
+            # if it is None, means all requests are finished
             if batch.spec_info.verified_id is not None:
                 self.forward_draft_extend_after_decode(batch)
+
             return (
                 logits_output,
-                verified_id,
-                model_worker_batch,
-                sum(
+                verify_output.verified_id,
+                model_worker_batch.bid,
+                sum(verify_output.accept_length_per_req_cpu),
            )
 
         else:
-
-
-
-            model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
-            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
-                model_worker_batch
+            logits_output, next_token_ids, bid = self.forward_target_extend(batch)
+            self.forward_draft_extend(
+                batch, logits_output.hidden_states, next_token_ids
             )
-
-
-
-
-
-
-
-
+            return logits_output, next_token_ids, bid, 0
+
+    def forward_target_extend(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, List[int], int]:
+        """Run the target extend.
+
+        Args:
+            batch: The batch to run. States could be modified.
+
+        Returns:
+            logits_output: The output of logits. It will contain the full hidden states.
+            next_token_ids: Next token ids generated.
+            bid: The model batch ID. Used for overlap schedule.
+        """
+        # Forward with the target model and get hidden states.
+        # We need the full hidden states to prefill the KV cache of the draft model.
+        model_worker_batch = batch.get_model_worker_batch()
+        model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+        logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+            model_worker_batch
+        )
+        return logits_output, next_token_ids, model_worker_batch.bid
 
     def draft(self, batch: ScheduleBatch):
-        self._set_mem_pool(batch, self.model_runner)
-
         # Parse args
         num_seqs = batch.batch_size()
         spec_info = batch.spec_info
@@ -172,7 +228,6 @@ class EAGLEWorker(TpModelWorker):
             self.topk,
             self.speculative_num_steps,
         )
-
        batch.out_cache_loc = out_cache_loc
        batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
        spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
@@ -180,11 +235,12 @@ class EAGLEWorker(TpModelWorker):
         # Get forward batch
         spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
         can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
             forward_batch
         )
-
         if can_cuda_graph:
             score_list, token_list, parents_list = self.cuda_graph_runner.replay(
                 forward_batch
@@ -192,7 +248,9 @@ class EAGLEWorker(TpModelWorker):
         else:
             # Initialize attention backend
             self.draft_attn_backend.init_forward_metadata(forward_batch)
-
+            forward_batch = ForwardBatch.init_new(
+                model_worker_batch, self.draft_model_runner
+            )
             # Run forward steps
             score_list, token_list, parents_list = self.draft_forward(forward_batch)
 
@@ -209,10 +267,7 @@ class EAGLEWorker(TpModelWorker):
             batch.sampling_info.is_all_greedy,
         )
 
-
-        batch.token_to_kv_pool.free(out_cache_loc)
-        self._set_mem_pool(batch, self.target_worker.model_runner)
-        return ret
+        return ret, out_cache_loc
 
     def draft_forward(self, forward_batch: ForwardBatch):
         # Parse args
@@ -223,6 +278,8 @@ class EAGLEWorker(TpModelWorker):
             spec_info.topk_index,
             spec_info.hidden_states,
         )
+        if self.hot_token_id is not None:
+            topk_index = self.hot_token_id[topk_index]
 
         # Return values
         score_list: List[torch.Tensor] = []
@@ -260,8 +317,11 @@ class EAGLEWorker(TpModelWorker):
             logits_output = self.model_runner.model.forward(
                 forward_batch.input_ids, forward_batch.positions, forward_batch
             )
+            self._detect_nan_if_needed(logits_output)
             probs = torch.softmax(logits_output.next_token_logits, dim=-1)
             topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+            if self.hot_token_id is not None:
+                topk_index = self.hot_token_id[topk_index]
             hidden_states = logits_output.hidden_states
 
         return score_list, token_list, parents_list
@@ -274,68 +334,135 @@ class EAGLEWorker(TpModelWorker):
         logits_output, _ = self.target_worker.forward_batch_generation(
             model_worker_batch, skip_sample=True
         )
+        self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
-        res = spec_info.verify(
+        res: EagleVerifyOutput = spec_info.verify(
+            batch, logits_output, self.token_to_kv_pool_allocator
+        )
+
+        # Post process based on verified outputs.
+        # Pick indices that we care (accepeted)
+        logits_output.next_token_logits = logits_output.next_token_logits[
+            res.accepeted_indices_cpu
+        ]
+        logits_output.hidden_states = logits_output.hidden_states[
+            res.accepeted_indices_cpu
+        ]
+        # Prepare the batch for the next draft forwards.
         batch.forward_mode = ForwardMode.DECODE
-
+        batch.spec_info = res.draft_input
+
+        if batch.return_logprob:
+            # Compute output logprobs using the sampler.
+            num_tokens_per_req = [
+                accept + 1 for accept in res.accept_length_per_req_cpu
+            ]
+            self.target_worker.model_runner.update_output_logprobs(
+                logits_output,
+                batch.sampling_info,
+                batch.top_logprobs_nums,
+                batch.token_ids_logprobs,
+                res.verified_id,
+                # +1 for bonus token.
+                num_tokens_per_req=num_tokens_per_req,
+            )
 
-
-
+            # Add output logprobs to the request.
+            pt = 0
+            # NOTE: tolist() of these values are skipped when output is processed
+            next_token_logprobs = res.logits_output.next_token_logprobs.tolist()
+            verified_ids = res.verified_id.tolist()
+            for req, num_tokens in zip(batch.reqs, num_tokens_per_req):
+                for _ in range(num_tokens):
+                    if req.return_logprob:
+                        token_id = verified_ids[pt]
+                        req.output_token_logprobs_val.append(next_token_logprobs[pt])
+                        req.output_token_logprobs_idx.append(token_id)
+                        if req.top_logprobs_num > 0:
+                            req.output_top_logprobs_val.append(
+                                res.logits_output.next_token_top_logprobs_val[pt]
+                            )
+                            req.output_top_logprobs_idx.append(
+                                res.logits_output.next_token_top_logprobs_idx[pt]
+                            )
+                    pt += 1
+
+        return logits_output, res, model_worker_batch
+
+    def forward_draft_extend(
+        self,
+        batch: ScheduleBatch,
+        hidden_states: torch.Tensor,
+        next_token_ids: List[int],
+    ):
+        """Run draft model extend. This API modifies the states of the batch.
+
+        Args:
+            batch: The batch to run.
+            hidden_states: Hidden states from the target model forward
+            next_token_ids: Next token ids generated from the target forward.
+        """
+        batch.spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            verified_id=next_token_ids,
+        )
         batch.spec_info.prepare_for_extend(batch)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(
-
-
-
-
-
-
-
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        forward_batch.return_logprob = False
+        logits_output = self.draft_model_runner.forward(forward_batch)
+        self._detect_nan_if_needed(logits_output)
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+        assert forward_batch.spec_info is batch.spec_info
+        self.capture_for_decode(logits_output, forward_batch.spec_info)
 
     def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
         seq_lens_backup = batch.seq_lens
-        req_pool_indices_backup = batch.req_pool_indices
-
-        self._set_mem_pool(batch, self.model_runner)
         batch.forward_mode = ForwardMode.DRAFT_EXTEND
         batch.spec_info.prepare_extend_after_decode(batch, self.speculative_num_steps)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        # We don't need logprob for this extend.
+        original_return_logprob = batch.return_logprob
+        batch.return_logprob = False
         model_worker_batch = batch.get_model_worker_batch()
-        forward_batch = ForwardBatch.init_new(
-
-
-
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        logits_output = self.draft_model_runner.forward(forward_batch)
+        self._detect_nan_if_needed(logits_output)
+        assert forward_batch.spec_info is batch.spec_info
+        self.capture_for_decode(logits_output, forward_batch.spec_info)
 
         # Restore backup.
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
+        batch.return_logprob = original_return_logprob
         batch.forward_mode = ForwardMode.DECODE
         batch.seq_lens = seq_lens_backup
-        batch.req_pool_indices = req_pool_indices_backup
 
     def capture_for_decode(
-        self, logits_output: LogitsProcessorOutput,
+        self, logits_output: LogitsProcessorOutput, draft_input: EagleDraftInput
     ):
         probs = torch.softmax(logits_output.next_token_logits, dim=-1)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.model_runner.token_to_kv_pool.free(kv_indices)
-        self.model_runner.req_to_token_pool.free(req.req_pool_idx)
+        draft_input.topk_p, draft_input.topk_index = fast_topk(probs, self.topk, dim=-1)
+        draft_input.hidden_states = logits_output.hidden_states
+
+    def _detect_nan_if_needed(self, logits_output: LogitsProcessorOutput):
+        if self.enable_nan_detection:
+            logits = logits_output.next_token_logits
+            if torch.any(torch.isnan(logits)):
+                logger.warning("Detected errors during sampling! NaN in the logits.")
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
+
+
+def load_token_map(token_map_path: str) -> List[int]:
+    if not os.path.exists(token_map_path):
+        cache_dir = snapshot_download(
+            os.path.dirname(token_map_path),
+            ignore_patterns=["*.bin", "*.safetensors"],
+        )
+        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
    hot_token_id = torch.load(token_map_path)
    return torch.tensor(hot_token_id, dtype=torch.int32)
```
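The `hot_token_id` path added above lets the draft model keep only a "hot" slice of the lm_head (`head.data = head.data[self.hot_token_id]`), so the draft's top-k picks index the reduced vocabulary and must be mapped back to full-vocabulary token ids, as `topk_index = self.hot_token_id[topk_index]` does in `draft_forward`. A toy sketch of that index arithmetic follows; all sizes and tensors here are invented for illustration:

```python
# Toy illustration of the hot-vocab remapping used in eagle_worker.py above.
import torch

full_vocab_size, hot_vocab_size, topk, hidden_dim = 1000, 16, 4, 8
hot_token_id = torch.randperm(full_vocab_size)[:hot_vocab_size]  # stand-in token map

# The draft head is sliced to hot tokens, mirroring `head.data = head.data[self.hot_token_id]`.
full_head = torch.randn(full_vocab_size, hidden_dim)
draft_head = full_head[hot_token_id]              # (hot_vocab_size, hidden_dim)

# Draft logits therefore index the *hot* vocab, not the full one.
hidden = torch.randn(1, hidden_dim)
logits = hidden @ draft_head.T                    # (1, hot_vocab_size)
probs = torch.softmax(logits, dim=-1)
topk_p, topk_index = probs.topk(topk, dim=-1)     # indices in [0, hot_vocab_size)

# Map back to full-vocabulary token ids before they leave the draft loop.
full_token_ids = hot_token_id[topk_index]         # indices in [0, full_vocab_size)
print(full_token_ids)
```

This is why the remap appears twice in `draft_forward`: both the cached top-k indices carried in `spec_info` and the per-step top-k indices live in hot-vocab space until remapped.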
sglang/srt/speculative/spec_info.py

```diff
@@ -5,30 +5,18 @@ class SpeculativeAlgorithm(IntEnum):
     NONE = auto()
     EAGLE = auto()
 
-    # NEXTN spec decoding is for DeepSeek V3/R1
-    # currently it's implemented based on EAGLE
-    NEXTN = auto()
-
     def is_none(self):
         return self == SpeculativeAlgorithm.NONE
 
     def is_eagle(self):
-        return self == SpeculativeAlgorithm.EAGLE
-
-    def is_nextn(self):
-        return self == SpeculativeAlgorithm.NEXTN
+        return self == SpeculativeAlgorithm.EAGLE
 
     @staticmethod
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
-            "NEXTN": SpeculativeAlgorithm.NEXTN,
             None: SpeculativeAlgorithm.NONE,
         }
         if name is not None:
             name = name.upper()
         return name_map[name]
-
-
-class SpecInfo:
-    pass
```