sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff compares two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +220 -378
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +208 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py

@@ -31,7 +31,7 @@ from __future__ import annotations

from dataclasses import dataclass
from enum import IntEnum, auto
- from typing import TYPE_CHECKING, List, Optional
+ from typing import TYPE_CHECKING, List, Optional, Union

import torch
import triton

@@ -41,12 +41,13 @@ from sglang.srt.layers.rotary_embedding import MRotaryEmbedding

from sglang.srt.utils import get_compiler_backend

if TYPE_CHECKING:
-     from sglang.srt.layers.attention import AttentionBackend
+     from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
    from sglang.srt.managers.schedule_batch import ImageInputs, ModelWorkerBatch
    from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
    from sglang.srt.model_executor.model_runner import ModelRunner
    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
-     from sglang.srt.speculative.
+     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+     from sglang.srt.speculative.spec_info import SpeculativeAlgorithm


class ForwardMode(IntEnum):

@@ -112,7 +113,9 @@ class ForwardMode(IntEnum):

class CaptureHiddenMode(IntEnum):
    NULL = auto()
+     # Capture hidden states of all tokens.
    FULL = auto()
+     # Capture a hidden state of the last token.
    LAST = auto()

    def need_capture(self):

@@ -148,10 +151,14 @@ class ForwardBatch:

    # For logprob
    return_logprob: bool = False
    top_logprobs_nums: Optional[List[int]] = None
+     token_ids_logprobs: Optional[List[List[int]]] = None

    # Position information
    positions: torch.Tensor = None

+     # For decode
+     decode_seq_lens_cpu: Optional[torch.Tensor] = None
+
    # For extend
    extend_num_tokens: Optional[int] = None
    extend_seq_lens: Optional[torch.Tensor] = None

@@ -160,6 +167,7 @@ class ForwardBatch:

    extend_prefix_lens_cpu: Optional[List[int]] = None
    extend_seq_lens_cpu: Optional[List[int]] = None
    extend_logprob_start_lens_cpu: Optional[List[int]] = None
+     extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None

    # For multimodal
    image_inputs: Optional[List[ImageInputs]] = None

@@ -185,15 +193,27 @@ class ForwardBatch:

    attn_backend: AttentionBackend = None

    # For DP attention
-
+     global_num_tokens_cpu: Optional[List[int]] = None
+     global_num_tokens_gpu: Optional[torch.Tensor] = None
+     # Has to be None when cuda graph is captured.
+     global_num_tokens_for_logprob_cpu: Optional[List[int]] = None
+     global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
+     # for extend, local start pos and num tokens is different in logits processor
+     # this will be computed in get_dp_local_info
+     # this will be recomputed in LogitsMetadata.from_forward_batch
+     dp_local_start_pos: Optional[torch.Tensor] = None  # cached info at runtime
+     dp_local_num_tokens: Optional[torch.Tensor] = None  # cached info at runtime
    gathered_buffer: Optional[torch.Tensor] = None
    can_run_dp_cuda_graph: bool = False

    # Speculative decoding
-     spec_info:
+     spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
    spec_algorithm: SpeculativeAlgorithm = None
    capture_hidden_mode: CaptureHiddenMode = None

+     # For padding
+     padded_static_len: int = -1  # -1 if not padded
+
    # For Qwen2-VL
    mrope_positions: torch.Tensor = None

@@ -203,8 +223,13 @@ class ForwardBatch:

        batch: ModelWorkerBatch,
        model_runner: ModelRunner,
    ):
-
        device = model_runner.device
+         extend_input_logprob_token_ids_gpu = None
+         if batch.extend_input_logprob_token_ids is not None:
+             extend_input_logprob_token_ids_gpu = (
+                 batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
+             )
+
        ret = cls(
            forward_mode=batch.forward_mode,
            batch_size=len(batch.seq_lens),

@@ -220,7 +245,7 @@ class ForwardBatch:

            seq_lens_sum=batch.seq_lens_sum,
            return_logprob=batch.return_logprob,
            top_logprobs_nums=batch.top_logprobs_nums,
-
+             token_ids_logprobs=batch.token_ids_logprobs,
            can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
            lora_paths=batch.lora_paths,
            sampling_info=batch.sampling_info,

@@ -231,10 +256,12 @@ class ForwardBatch:

            spec_info=batch.spec_info,
            capture_hidden_mode=batch.capture_hidden_mode,
            input_embeds=batch.input_embeds,
+             extend_input_logprob_token_ids_gpu=extend_input_logprob_token_ids_gpu,
        )

-         if
-
+         if batch.global_num_tokens is not None:
+             ret.global_num_tokens_cpu = batch.global_num_tokens
+             max_len = max(ret.global_num_tokens_cpu)
            ret.gathered_buffer = torch.zeros(
                (max_len * model_runner.tp_size, model_runner.model_config.hidden_size),
                dtype=model_runner.dtype,

@@ -256,6 +283,8 @@ class ForwardBatch:

        if ret.forward_mode.is_decode():
            if ret.positions is None:
                ret.positions = clamp_position(batch.seq_lens)
+             if ret.decode_seq_lens_cpu is None:
+                 ret.decode_seq_lens_cpu = batch.decode_seq_lens
        else:
            ret.extend_seq_lens = torch.tensor(
                batch.extend_seq_lens, dtype=torch.int32

@@ -263,13 +292,12 @@ class ForwardBatch:

            ret.extend_prefix_lens = torch.tensor(
                batch.extend_prefix_lens, dtype=torch.int32
            ).to(device, non_blocking=True)
-             if (
-                 model_runner.server_args.attention_backend != "torch_native"
-                 and model_runner.server_args.speculative_algorithm != "NEXTN"
-             ):
+             if model_runner.server_args.attention_backend != "torch_native":
                ret.extend_num_tokens = batch.extend_num_tokens
                positions, ret.extend_start_loc = compute_position_triton(
-                     ret.extend_prefix_lens,
+                     ret.extend_prefix_lens,
+                     ret.extend_seq_lens,
+                     ret.extend_num_tokens,
                )
            else:
                positions, ret.extend_start_loc = compute_position_torch(

@@ -341,6 +369,7 @@ class ForwardBatch:

            )
            batch.image_inputs[i].mrope_position_delta = mrope_position_delta
            mrope_positions_list[i] = mrope_positions
+
        self.mrope_positions = torch.concat(
            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
            axis=1,

@@ -353,6 +382,8 @@ def compute_position_triton(

):
    """Compute positions. It is a fused version of `compute_position_torch`."""
    batch_size = extend_seq_lens.shape[0]
+     has_prefix = extend_prefix_lens.shape[0] == batch_size
+
    positions = torch.empty(
        extend_seq_lens_sum, dtype=torch.int64, device=extend_seq_lens.device
    )

@@ -366,6 +397,7 @@ def compute_position_triton(

        extend_start_loc,
        extend_prefix_lens,
        extend_seq_lens,
+         has_prefix,
    )

    return positions, extend_start_loc

@@ -377,11 +409,12 @@ def compute_position_kernel(

    extend_start_loc,
    extend_prefix_lens,
    extend_seq_lens,
+     has_prefix: tl.constexpr,
):
    BLOCK_SIZE: tl.constexpr = 512
-     pid = tl.program_id(0)
+     pid = tl.program_id(0).to(tl.int64)

-     prefix_len = tl.load(extend_prefix_lens + pid)
+     prefix_len = tl.load(extend_prefix_lens + pid) if has_prefix else 0
    seq_len = tl.load(extend_seq_lens + pid)

    # TODO: optimize this?
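
For orientation, the following is a minimal plain-PyTorch sketch (not the package's compute_position_torch implementation) of the pair that compute_position_triton produces: per-token positions that start at each request's prefix length, plus the start offset of each request inside the flattened extend buffer. The new has_prefix flag only controls whether extend_prefix_lens is read or treated as all zeros.

import torch

def compute_position_reference(extend_prefix_lens, extend_seq_lens):
    # Positions count up from each request's prefix length.
    positions = torch.cat(
        [
            torch.arange(prefix, prefix + seq)
            for prefix, seq in zip(extend_prefix_lens.tolist(), extend_seq_lens.tolist())
        ]
    )
    # Exclusive prefix sum: where each request's tokens start in the flat buffer.
    extend_start_loc = torch.cumsum(extend_seq_lens, dim=0) - extend_seq_lens
    return positions, extend_start_loc

# Example: prefix lens [3, 0] and extend lens [2, 4] give
# positions [3, 4, 0, 1, 2, 3] and extend_start_loc [0, 2].
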
sglang/srt/model_executor/model_runner.py

@@ -13,11 +13,14 @@

# ==============================================================================
"""ModelRunner runs the forward passes of the models."""

+ import datetime
import gc
import json
import logging
+ import os
import time
- from
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple, Union

import torch
import torch.distributed as dist

@@ -34,6 +37,7 @@ from sglang.srt.distributed import (

from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend
from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
+ from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
from sglang.srt.layers.dp_attention import (

@@ -51,14 +55,18 @@ from sglang.srt.mem_cache.memory_pool import (

    MHATokenToKVPool,
    MLATokenToKVPool,
    ReqToTokenPool,
+     TokenToKVPoolAllocator,
)
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader import get_model
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
from sglang.srt.server_args import ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import (
+     MultiprocessingSerializer,
    enable_show_time_cost,
    get_available_gpu_memory,
    init_custom_process_group,

@@ -69,10 +77,15 @@ from sglang.srt.utils import (

    set_cpu_offload_max_bytes,
    set_cuda_arch,
)
+ from sglang.utils import get_exception_traceback

logger = logging.getLogger(__name__)


+ SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
+ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
+
+
class ModelRunner:
    """ModelRunner runs the forward passes of the models."""

@@ -86,6 +99,8 @@ class ModelRunner:

        nccl_port: int,
        server_args: ServerArgs,
        is_draft_worker: bool = False,
+         req_to_token_pool: Optional[ReqToTokenPool] = None,
+         token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None,
    ):
        # Parse args
        self.model_config = model_config

@@ -103,6 +118,8 @@ class ModelRunner:

        self.spec_algorithm = SpeculativeAlgorithm.from_string(
            server_args.speculative_algorithm
        )
+         self.req_to_token_pool = req_to_token_pool
+         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator

        # Model-specific adjustment
        if (

@@ -113,9 +130,9 @@ class ModelRunner:

        if self.server_args.device != "cpu":
            if server_args.enable_flashinfer_mla:
                logger.info(
-                     "
+                     "MLA optimization is turned on. Use flashinfer mla backend."
                )
-                 self.server_args.attention_backend = "
+                 self.server_args.attention_backend = "flashinfer_mla"
            else:
                logger.info("MLA optimization is turned on. Use triton backend.")
                self.server_args.attention_backend = "triton"

@@ -176,8 +193,13 @@ class ModelRunner:

                "enable_dp_attention": server_args.enable_dp_attention,
                "enable_ep_moe": server_args.enable_ep_moe,
                "device": server_args.device,
+                 "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
+                 "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
                "enable_flashinfer_mla": server_args.enable_flashinfer_mla,
                "disable_radix_cache": server_args.disable_radix_cache,
+                 "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
+                 "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
+                 "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
            }
        )

@@ -194,6 +216,18 @@ class ModelRunner:

        self.sampler = Sampler()
        self.load_model()

+         # Handle the case where some of models don't finish loading.
+         try:
+             dist.monitored_barrier(
+                 group=get_tp_group().cpu_group,
+                 timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S),
+                 wait_all_ranks=True,
+             )
+         except RuntimeError:
+             raise ValueError(
+                 f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
+             ) from None
+
        # Apply torchao quantization
        torchao_applied = getattr(self.model, "torchao_applied", False)
        # In layered loading, torchao may have been applied

@@ -228,19 +262,18 @@ class ModelRunner:


    def init_torch_distributed(self):
        logger.info("Init torch distributed begin.")
-
        torch.get_device_module(self.device).set_device(self.gpu_id)
+
        if self.device == "cuda":
            backend = "nccl"
        elif self.device == "xpu":
-
-             # Need to use xccl for xpu backend in the future
-             backend = "gloo"
+             backend = "xccl"
        elif self.device == "hpu":
            backend = "hccl"
        elif self.device == "cpu":
            backend = "gloo"

+         before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        if not self.server_args.enable_p2p_check:
            monkey_patch_p2p_access_check()

@@ -258,6 +291,7 @@ class ModelRunner:

            rank=self.tp_rank,
            local_rank=self.gpu_id,
            distributed_init_method=dist_init_method,
+             timeout=self.server_args.dist_timeout,
        )
        initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
        initialize_dp_attention(

@@ -270,20 +304,24 @@ class ModelRunner:

        min_per_gpu_memory = get_available_gpu_memory(
            self.device, self.gpu_id, distributed=self.tp_size > 1
        )
+         local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
        self.tp_group = get_tp_group()
        self.attention_tp_group = get_attention_tp_group()

        # Check memory for tensor parallelism
        if self.tp_size > 1:
-             local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
            if min_per_gpu_memory < local_gpu_memory * 0.9:
                raise ValueError(
                    "The memory capacity is unbalanced. Some GPUs may be occupied by other processes."
                )

+         logger.info(
+             f"Init torch distributed ends. mem usage={(before_avail_memory - local_gpu_memory):.2f} GB"
+         )
        return min_per_gpu_memory

    def load_model(self):
+         before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

@@ -353,11 +391,13 @@ class ModelRunner:

        )
        self.dtype = self.model_config.dtype

+         after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
            f"Load weight end. "
            f"type={type(self.model).__name__}, "
            f"dtype={self.dtype}, "
-             f"avail mem={
+             f"avail mem={after_avail_memory:.2f} GB, "
+             f"mem usage={(before_avail_memory - after_avail_memory):.2f} GB."
        )

    def update_weights_from_disk(

@@ -512,8 +552,21 @@ class ModelRunner:

            logger.error(error_msg)
            return False, error_msg

-     def update_weights_from_tensor(
-         self
+     def update_weights_from_tensor(
+         self,
+         named_tensors: List[Tuple[str, Union[torch.Tensor, "LocalSerializedTensor"]]],
+         load_format: Optional[str] = None,
+     ):
+         named_tensors = [
+             (name, _unwrap_tensor(tensor, tp_rank=self.tp_rank))
+             for name, tensor in named_tensors
+         ]
+         if load_format == "direct":
+             _model_load_weights_direct(self.model, named_tensors)
+         elif load_format is None:
+             self.model.load_weights(named_tensors)
+         else:
+             raise NotImplementedError(f"Unknown load_format={load_format}")
        return True, "Success"

    def get_weights_by_name(
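
A hedged caller-side sketch of the reworked update_weights_from_tensor (the tensor name and shape below are placeholders, not taken from the package): the default path hands the pairs to the model's own load_weights, while load_format="direct" writes them straight into named_parameters via default_weight_loader.

import torch

# `model_runner` is an already-initialized ModelRunner instance.
new_weights = [("model.embed_tokens.weight", torch.zeros(32000, 4096))]  # placeholder

# Default path: delegate to the model's load_weights().
model_runner.update_weights_from_tensor(new_weights)

# "direct" path: copy each tensor into model.named_parameters(), bypassing
# model-specific weight mapping.
model_runner.update_weights_from_tensor(new_weights, load_format="direct")
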
@@ -606,15 +659,31 @@ class ModelRunner:

            4096,
        )

+         if SGLANG_CI_SMALL_KV_SIZE:
+             self.max_total_num_tokens = int(SGLANG_CI_SMALL_KV_SIZE)
+
        if not self.spec_algorithm.is_none():
            if self.is_draft_worker:
                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+                 max_num_reqs = self.server_args.max_num_reqs
            else:
+                 # We are sharing the `token_to_kv_pool`, and both verify and draft tokens
+                 # can be concurrently allocated, so we should give a headroom for it.
                self.server_args.draft_runner_cache_size = (
                    self.max_total_num_tokens
-
+                     # draft
+                     + max_num_reqs
+                     * self.server_args.speculative_num_steps
+                     * self.server_args.speculative_eagle_topk
+                     # verify
+                     + max_num_reqs * self.server_args.speculative_num_draft_tokens
+                     # buffer
                    + 100
                )
+                 # Target worker and draft worker shares the same indices for the
+                 # token_to_kv_pool, so we should make sure to match max_total_num_tokens.
+                 self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+                 self.server_args.max_num_reqs = max_num_reqs

        if max_total_tokens is not None:
            if max_total_tokens > self.max_total_num_tokens:
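
A worked example of the headroom formula above, with illustrative numbers only (real values come from ServerArgs):

max_total_num_tokens = 100_000        # target worker KV capacity
max_num_reqs = 32
speculative_num_steps = 5
speculative_eagle_topk = 4
speculative_num_draft_tokens = 8

draft_runner_cache_size = (
    max_total_num_tokens
    + max_num_reqs * speculative_num_steps * speculative_eagle_topk  # draft: 640
    + max_num_reqs * speculative_num_draft_tokens                    # verify: 256
    + 100                                                            # buffer
)
# draft_runner_cache_size == 100_996; both workers then share it as max_total_num_tokens.
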
@@ -630,12 +699,26 @@ class ModelRunner:

                "Not enough memory. Please try to increase --mem-fraction-static."
            )

-         self.req_to_token_pool
-
-
-
-
-
+         if self.req_to_token_pool is None:
+             self.req_to_token_pool = ReqToTokenPool(
+                 size=max_num_reqs + 1,
+                 max_context_len=self.model_config.context_len + 4,
+                 device=self.device,
+                 enable_memory_saver=self.server_args.enable_memory_saver,
+             )
+         else:
+             # Draft worker shares req_to_token_pool with the target worker.
+             assert self.is_draft_worker
+
+         if self.token_to_kv_pool_allocator is None:
+             self.token_to_kv_pool_allocator = TokenToKVPoolAllocator(
+                 self.max_total_num_tokens,
+                 dtype=self.kv_cache_dtype,
+                 device=self.device,
+             )
+         else:
+             assert self.is_draft_worker
+
        if (
            self.model_config.attention_arch == AttentionArch.MLA
            and not self.server_args.disable_mla

@@ -703,6 +786,8 @@ class ModelRunner:

            self.attn_backend = TritonAttnBackend(self)
        elif self.server_args.attention_backend == "torch_native":
            self.attn_backend = TorchNativeAttnBackend(self)
+         elif self.server_args.attention_backend == "flashinfer_mla":
+             self.attn_backend = FlashInferMLAAttnBackend(self)
        else:
            raise ValueError(
                f"Invalid attention backend: {self.server_args.attention_backend}"
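
The new branch is taken once server_args.attention_backend has been rewritten to "flashinfer_mla" by the enable_flashinfer_mla adjustment shown earlier. A hedged sketch of turning it on through the offline engine, assuming the keyword argument is forwarded into ServerArgs as other engine kwargs are (model name is illustrative):

import sglang as sgl

# Assumption: Engine forwards kwargs to ServerArgs, so enable_flashinfer_mla=True
# triggers the "flashinfer_mla" backend selection shown above.
llm = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # an MLA-style model, for illustration
    enable_flashinfer_mla=True,
)
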
@@ -737,9 +822,16 @@ class ModelRunner:

            return

        tic = time.time()
-
+         before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+         logger.info(
+             f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+         )
        self.cuda_graph_runner = CudaGraphRunner(self)
-
+         after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+         logger.info(
+             f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
+             f"avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
+         )

    def apply_torch_tp(self):
        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")

@@ -754,8 +846,12 @@ class ModelRunner:

            forward_batch.input_ids, forward_batch.positions, forward_batch
        )

-     def forward_extend(
-         self
+     def forward_extend(
+         self, forward_batch: ForwardBatch, skip_attn_backend_init: bool = False
+     ):
+         if not skip_attn_backend_init:
+             self.attn_backend.init_forward_metadata(forward_batch)
+
        if self.is_generation:
            if forward_batch.input_embeds is None:
                return self.model.forward(

@@ -799,11 +895,10 @@ class ModelRunner:

        else:
            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")

-     def
-         self, logits_output: LogitsProcessorOutput,
-     )
+     def _preprocess_logits(
+         self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
+     ):
        # Apply logit bias
-         sampling_info = forward_batch.sampling_info
        if sampling_info.sampling_info_done:
            # Overlap mode: the function update_regex_vocab_mask was executed
            # in process_batch_result of the last batch.

@@ -812,15 +907,77 @@ class ModelRunner:

        else:
            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
            sampling_info.update_regex_vocab_mask()
-             sampling_info.update_penalties()
            sampling_info.apply_logits_bias(logits_output.next_token_logits)

+     def update_output_logprobs(
+         self,
+         logits_output: LogitsProcessorOutput,
+         sampling_info: SamplingBatchInfo,
+         top_logprobs_nums: List[int],
+         token_ids_logprobs: List[int],
+         next_token_ids: torch.Tensor,
+         *,
+         num_tokens_per_req: List[int],
+     ):
+         """Update the logits_output's output logprob based on next_token_ids
+
+         Args:
+             logits_output: The logits output from the model forward
+             sampling_info: Sampling info for logprob calculation
+             top_logprobs_nums: Number of logprobs per request.
+             next_token_ids: Next token ids.
+             num_tokens_per_req: The number of tokens per request.
+
+         Returns:
+             A list of next_token_ids
+         """
+         self._preprocess_logits(logits_output, sampling_info)
+         # We should repeat top_logprobs_nums to match num_tokens_per_req.
+         top_logprobs_nums_repeat_interleaved = []
+         token_ids_logprobs_repeat_interleaved = []
+         for num, num_tokens in zip(top_logprobs_nums, num_tokens_per_req):
+             top_logprobs_nums_repeat_interleaved.extend([num] * num_tokens)
+         for token_ids, num_tokens in zip(token_ids_logprobs, num_tokens_per_req):
+             token_ids_logprobs_repeat_interleaved.extend([token_ids] * num_tokens)
+         self.sampler(
+             logits_output,
+             sampling_info,
+             True,
+             top_logprobs_nums_repeat_interleaved,
+             token_ids_logprobs_repeat_interleaved,
+             batch_next_token_ids=next_token_ids,
+         )
+
+     def sample(
+         self,
+         logits_output: LogitsProcessorOutput,
+         forward_batch: ForwardBatch,
+     ) -> torch.Tensor:
+         """Sample and compute logprobs and update logits_output.
+
+         Args:
+             logits_output: The logits output from the model forward
+             forward_batch: The forward batch that generates logits_output
+
+         Returns:
+             A list of next_token_ids
+         """
+         # For duplex models with multiple output streams.
+         if isinstance(logits_output, tuple):
+             return torch.stack(
+                 [self.sample(values, forward_batch) for values in logits_output],
+                 axis=-1,
+             )
+
+         self._preprocess_logits(logits_output, forward_batch.sampling_info)
+
        # Sample the next tokens
        next_token_ids = self.sampler(
            logits_output,
-             sampling_info,
+             forward_batch.sampling_info,
            forward_batch.return_logprob,
            forward_batch.top_logprobs_nums,
+             forward_batch.token_ids_logprobs,
        )
        return next_token_ids

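
To make the repeat-interleave step in update_output_logprobs concrete, here is a toy expansion (values invented for illustration): each per-request setting is duplicated once for every token that request contributed to the batch.

top_logprobs_nums = [5, 0]        # per request
token_ids_logprobs = [[42], []]   # per request
num_tokens_per_req = [3, 1]       # tokens contributed by each request

expanded_top, expanded_ids = [], []
for num, n_tok in zip(top_logprobs_nums, num_tokens_per_req):
    expanded_top.extend([num] * n_tok)
for ids, n_tok in zip(token_ids_logprobs, num_tokens_per_req):
    expanded_ids.extend([ids] * n_tok)

# expanded_top -> [5, 5, 5, 0]
# expanded_ids -> [[42], [42], [42], []]
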
@@ -832,3 +989,26 @@ class ModelRunner:

        if rope_scaling is None:
            return False
        return rope_scaling.get("type", None) == "mrope"
+
+
+ def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
+     params_dict = dict(model.named_parameters())
+     for name, tensor in named_tensors:
+         default_weight_loader(params_dict[name], tensor)
+
+
+ def _unwrap_tensor(tensor, tp_rank):
+     if isinstance(tensor, LocalSerializedTensor):
+         return tensor.get(tp_rank)
+     return tensor
+
+
+ @dataclass
+ class LocalSerializedTensor:
+     """torch.Tensor that gets serialized by MultiprocessingSerializer (which only serializes a pointer and not the data).
+     The i-th element in the list corresponds to i-th rank's GPU."""
+
+     values: List[bytes]
+
+     def get(self, rank: int):
+         return MultiprocessingSerializer.deserialize(self.values[rank])
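
A brief sketch of how these helpers compose (hedged: per_rank_payloads below is hypothetical, and we assume MultiprocessingSerializer offers a serialize counterpart on the sending side to the deserialize call shown above): every TP rank receives the same LocalSerializedTensor, and _unwrap_tensor picks out and deserializes only that rank's entry before the weights are loaded.

# Hypothetical payload produced on the sending side, one serialized blob per TP rank.
payload = LocalSerializedTensor(values=per_rank_payloads)  # List[bytes]

named_tensors = [("model.embed_tokens.weight", payload)]  # name is a placeholder
# Inside update_weights_from_tensor(), rank r resolves payload.get(r) via
# MultiprocessingSerializer.deserialize(values[r]) and loads the resulting tensor.
model_runner.update_weights_from_tensor(named_tensors)
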
sglang/srt/model_loader/loader.py

@@ -11,7 +11,7 @@ import math

import os
from abc import ABC, abstractmethod
from contextlib import contextmanager
- from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple,
+ from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast

import gguf
import huggingface_hub

@@ -19,7 +19,7 @@ import numpy as np

import torch
from huggingface_hub import HfApi, hf_hub_download
from torch import nn
- from transformers import AutoModelForCausalLM
+ from transformers import AutoModelForCausalLM
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

from sglang.srt.configs.device_config import DeviceConfig

@@ -197,7 +197,7 @@ class DefaultModelLoader(BaseModelLoader):


        Returns the path to the downloaded model, or None if the model is not
        downloaded from ModelScope."""
-         if "SGLANG_USE_MODELSCOPE"
+         if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
            # download model from ModelScope hub,
            # lazy import so that modelscope is not required for normal use.
            # pylint: disable=C.