sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +41 -27
  4. sglang/bench_one_batch.py +60 -4
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +83 -71
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +46 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/load_config.py +1 -0
  13. sglang/srt/configs/model_config.py +1 -0
  14. sglang/srt/constrained/base_grammar_backend.py +21 -0
  15. sglang/srt/constrained/xgrammar_backend.py +8 -4
  16. sglang/srt/conversation.py +14 -1
  17. sglang/srt/distributed/__init__.py +3 -3
  18. sglang/srt/distributed/communication_op.py +2 -1
  19. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  21. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  22. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  23. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  24. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  25. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  26. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  27. sglang/srt/distributed/parallel_state.py +1 -1
  28. sglang/srt/distributed/utils.py +2 -1
  29. sglang/srt/entrypoints/engine.py +452 -0
  30. sglang/srt/entrypoints/http_server.py +603 -0
  31. sglang/srt/function_call_parser.py +494 -0
  32. sglang/srt/layers/activation.py +8 -8
  33. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  34. sglang/srt/layers/attention/triton_backend.py +4 -6
  35. sglang/srt/layers/attention/vision.py +204 -0
  36. sglang/srt/layers/dp_attention.py +71 -0
  37. sglang/srt/layers/layernorm.py +5 -5
  38. sglang/srt/layers/linear.py +65 -14
  39. sglang/srt/layers/logits_processor.py +49 -64
  40. sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  41. sglang/srt/layers/moe/fused_moe_native.py +84 -1
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  45. sglang/srt/layers/parameter.py +18 -8
  46. sglang/srt/layers/quantization/__init__.py +20 -23
  47. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  49. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang/srt/layers/quantization/fp8.py +10 -4
  57. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  58. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  59. sglang/srt/layers/radix_attention.py +2 -2
  60. sglang/srt/layers/rotary_embedding.py +1184 -31
  61. sglang/srt/layers/sampler.py +64 -6
  62. sglang/srt/layers/torchao_utils.py +12 -6
  63. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  64. sglang/srt/lora/lora.py +1 -9
  65. sglang/srt/managers/configure_logging.py +3 -0
  66. sglang/srt/managers/data_parallel_controller.py +79 -72
  67. sglang/srt/managers/detokenizer_manager.py +24 -6
  68. sglang/srt/managers/image_processor.py +158 -2
  69. sglang/srt/managers/io_struct.py +57 -3
  70. sglang/srt/managers/schedule_batch.py +78 -45
  71. sglang/srt/managers/schedule_policy.py +26 -12
  72. sglang/srt/managers/scheduler.py +326 -201
  73. sglang/srt/managers/session_controller.py +1 -0
  74. sglang/srt/managers/tokenizer_manager.py +210 -121
  75. sglang/srt/managers/tp_worker.py +6 -4
  76. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  77. sglang/srt/managers/utils.py +44 -0
  78. sglang/srt/mem_cache/memory_pool.py +10 -32
  79. sglang/srt/metrics/collector.py +15 -6
  80. sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  81. sglang/srt/model_executor/forward_batch_info.py +5 -7
  82. sglang/srt/model_executor/model_runner.py +44 -19
  83. sglang/srt/model_loader/loader.py +83 -6
  84. sglang/srt/model_loader/weight_utils.py +145 -6
  85. sglang/srt/models/baichuan.py +6 -6
  86. sglang/srt/models/chatglm.py +2 -2
  87. sglang/srt/models/commandr.py +17 -5
  88. sglang/srt/models/dbrx.py +13 -5
  89. sglang/srt/models/deepseek.py +3 -3
  90. sglang/srt/models/deepseek_v2.py +11 -11
  91. sglang/srt/models/exaone.py +2 -2
  92. sglang/srt/models/gemma.py +2 -2
  93. sglang/srt/models/gemma2.py +15 -25
  94. sglang/srt/models/gpt2.py +3 -5
  95. sglang/srt/models/gpt_bigcode.py +1 -1
  96. sglang/srt/models/granite.py +2 -2
  97. sglang/srt/models/grok.py +4 -3
  98. sglang/srt/models/internlm2.py +2 -2
  99. sglang/srt/models/llama.py +7 -5
  100. sglang/srt/models/minicpm.py +2 -2
  101. sglang/srt/models/minicpm3.py +9 -9
  102. sglang/srt/models/minicpmv.py +1238 -0
  103. sglang/srt/models/mixtral.py +3 -3
  104. sglang/srt/models/mixtral_quant.py +3 -3
  105. sglang/srt/models/mllama.py +2 -2
  106. sglang/srt/models/olmo.py +3 -3
  107. sglang/srt/models/olmo2.py +4 -4
  108. sglang/srt/models/olmoe.py +7 -13
  109. sglang/srt/models/phi3_small.py +2 -2
  110. sglang/srt/models/qwen.py +2 -2
  111. sglang/srt/models/qwen2.py +41 -4
  112. sglang/srt/models/qwen2_moe.py +3 -3
  113. sglang/srt/models/qwen2_vl.py +22 -122
  114. sglang/srt/models/stablelm.py +2 -2
  115. sglang/srt/models/torch_native_llama.py +20 -7
  116. sglang/srt/models/xverse.py +6 -6
  117. sglang/srt/models/xverse_moe.py +6 -6
  118. sglang/srt/openai_api/adapter.py +139 -37
  119. sglang/srt/openai_api/protocol.py +7 -4
  120. sglang/srt/sampling/custom_logit_processor.py +38 -0
  121. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  122. sglang/srt/sampling/sampling_batch_info.py +143 -18
  123. sglang/srt/sampling/sampling_params.py +3 -1
  124. sglang/srt/server.py +4 -1090
  125. sglang/srt/server_args.py +77 -15
  126. sglang/srt/speculative/eagle_utils.py +37 -15
  127. sglang/srt/speculative/eagle_worker.py +11 -13
  128. sglang/srt/utils.py +164 -129
  129. sglang/test/runners.py +8 -13
  130. sglang/test/test_programs.py +2 -1
  131. sglang/test/test_utils.py +83 -22
  132. sglang/utils.py +12 -2
  133. sglang/version.py +1 -1
  134. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
  135. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
  136. sglang/launch_server_llavavid.py +0 -25
  137. sglang/srt/constrained/__init__.py +0 -16
  138. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  139. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  140. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  141. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/vision.py (new file)
@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+from sglang.srt.distributed import parallel_state
+from sglang.srt.distributed import utils as dist_utils
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization import QuantizationConfig
+
+
+def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    else:
+        x1, x2 = x[..., ::2], x[..., 1::2]
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )
+
+
+def apply_rotary_emb_torch(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+) -> torch.Tensor:
+    """
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+    """
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1]
+    cos = repeat(
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    sin = repeat(
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    return torch.cat(
+        [
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
+        ],
+        dim=-1,
+    )
+
+
+def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    t_ = t.float()
+    cos = freqs.cos()
+    sin = freqs.sin()
+    output = apply_rotary_emb_torch(t_, cos, sin).type_as(t)
+    return output
+
+
+class VisionAttention(nn.Module):
+    """Multi-headed attention without any cache, mostly used for ViT."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        use_qkv_parallel: bool,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, world_size
+        )
+        # self.tp_size = get_tensor_model_parallel_world_size()
+        # num_heads = self.num_heads_per_partition
+        self.use_qkv_parallel = use_qkv_parallel
+        if use_qkv_parallel:
+            self.head_dim = embed_dim // num_heads
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=embed_dim,
+                head_size=self.head_dim,
+                total_num_heads=num_heads,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+        else:
+            self.qkv_proj = ColumnParallelLinear(
+                input_size=embed_dim,
+                output_size=3 * projection_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+        self.proj = RowParallelLinear(
+            input_size=embed_dim,
+            output_size=embed_dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        Input shape: [b, s, embed_dim]
+        Output shape: [s, b, num_heads * head_size]
+        """
+
+        bsz, s, _ = x.shape
+        if self.use_qkv_parallel:
+            # [b, s, embed_dim] --> [b, s, embed_dim]
+            qkv, _ = self.qkv_proj(x)
+            q, k, v = qkv.chunk(3, dim=-1)
+
+            # [b, s, embed_dim] --> [b * s, num_heads, head_size]
+            q, k, v = [
+                x.reshape(
+                    bsz * s, self.num_attention_heads_per_partition, -1
+                ).contiguous()
+                for x in (q, k, v)
+            ]
+        else:
+            # [b, s, embed_dim] --> [s, b, embed_dim]
+            x = rearrange(x, "b s ... -> s b ...")
+            # [s, b, embed_dim] --> [s, b, head * 3 * head_dim]
+            qkv, _ = self.qkv_proj(x)
+            # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+            new_x_shape = qkv.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                3 * self.hidden_size_per_attention_head,
+            )
+            qkv = qkv.view(*new_x_shape)
+
+            # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+            q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
+
+            # [s, b, head, head_dim] --> [b, s, head, head_dim]
+            q, k, v = [
+                rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+            ]
+
+        if rotary_pos_emb is not None:
+            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
+            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+
+        if self.use_qkv_parallel:
+            pass
+        else:
+            # [b, s, head, head_dim] --> [b * s, head, head_dim]
+            q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
+
+        # [b * s, num_heads, head_size]
+        output = torch.empty_like(q)
+
+        seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).cuda()
+        max_seqlen = seq_lens.max().item()
+
+        context_attention_fwd(
+            q,
+            k,
+            v,
+            output,
+            cu_seqlens.cuda(),
+            seq_lens,
+            max_seqlen,
+            is_causal=False,
+        )
+
+        if self.use_qkv_parallel:
+
+            # [b * s, head, head_dim] --> [b, s, head * head_dim]
+            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+            # [b, s, head, head_dim] --> [b, s, head, head_dim]
+            output, _ = self.proj(output)
+        else:
+            # [b * s, head, head_dim] --> [b, s, head, head_dim]
+            context_layer = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+
+            # [s, b, num_heads * head_size]
+            context_layer = rearrange(
+                context_layer, "b s h d -> s b (h d)"
+            ).contiguous()
+
+            # [s, b, num_heads * head_size] --> [s, b, num_heads * head_size]
+            output, _ = self.proj(context_layer)
+
+        output = output.view(bsz, s, -1)
+
+        return output
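
Note: the rotary helpers in this new module implement the standard (non-interleaved) rotary embedding in plain PyTorch. A minimal, self-contained smoke test of apply_rotary_pos_emb_vision is sketched below; the tensor shapes and the frequency construction are illustrative assumptions, not taken from the diff.

import torch

from sglang.srt.layers.attention.vision import apply_rotary_pos_emb_vision

# Assumed shapes: [batch, seqlen, heads, head_dim], full-dimension rotary.
batch, seqlen, heads, head_dim = 2, 16, 4, 64
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seqlen).float(), inv_freq)  # [seqlen, head_dim / 2]

q = torch.randn(batch, seqlen, heads, head_dim)
q_rot = apply_rotary_pos_emb_vision(q, freqs)  # rotated in float32, cast back to q.dtype
assert q_rot.shape == q.shape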
sglang/srt/layers/dp_attention.py (new file)
@@ -0,0 +1,71 @@
+import torch
+
+from sglang.srt.distributed import GroupCoordinator, get_tp_group
+
+_ATTN_TP_GROUP = None
+_ATTN_TP_RANK = None
+_ATTN_TP_SIZE = None
+_DP_RANK = None
+_DP_SIZE = None
+
+
+def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    attn_tp_size = tp_size // dp_size
+    dp_rank = tp_rank // attn_tp_size
+    attn_tp_rank = tp_rank % attn_tp_size
+    return attn_tp_rank, attn_tp_size, dp_rank
+
+
+def initialize_dp_attention(enable_dp_attention, tp_rank, tp_size, dp_size):
+    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK, _DP_SIZE
+
+    from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
+
+    _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK = compute_dp_attention_world_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size
+    )
+    _DP_SIZE = dp_size
+
+    tp_group = get_tp_group()
+    _ATTN_TP_GROUP = GroupCoordinator(
+        [
+            list(range(head, head + _ATTN_TP_SIZE))
+            for head in range(0, tp_size, _ATTN_TP_SIZE)
+        ],
+        tp_rank,
+        torch.distributed.get_backend(tp_group.device_group),
+        SYNC_TOKEN_IDS_ACROSS_TP,
+        False,
+        False,
+        False,
+        False,
+        group_name="attention_tp",
+    )
+
+
+def get_attention_tp_group():
+    assert _ATTN_TP_GROUP is not None, "dp attention not initialized!"
+    return _ATTN_TP_GROUP
+
+
+def get_attention_tp_rank():
+    assert _ATTN_TP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_TP_RANK
+
+
+def get_attention_tp_size():
+    assert _ATTN_TP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_TP_SIZE
+
+
+def get_attention_dp_rank():
+    assert _DP_RANK is not None, "dp attention not initialized!"
+    return _DP_RANK
+
+
+def get_attention_dp_size():
+    assert _DP_SIZE is not None, "dp attention not initialized!"
+    return _DP_SIZE
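
Note: compute_dp_attention_world_info is pure integer arithmetic, so the rank layout it produces can be checked without any distributed initialization. A small worked example (tp_size=8 and dp_size=2 are arbitrary assumptions):

# Each DP replica owns an attention-TP group of tp_size // dp_size = 4 ranks.
for tp_rank in range(8):
    attn_tp_rank, attn_tp_size, dp_rank = compute_dp_attention_world_info(
        True, tp_rank, 8, 2
    )
    assert attn_tp_size == 4
    assert dp_rank == tp_rank // 4      # ranks 0-3 -> replica 0, ranks 4-7 -> replica 1
    assert attn_tp_rank == tp_rank % 4  # local rank inside the replica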
sglang/srt/layers/layernorm.py
@@ -19,10 +19,10 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import is_cuda_available
 
-if is_flashinfer_available():
-    from flashinfer.norm import (
+if is_cuda_available():
+    from sgl_kernel import (
         fused_add_rmsnorm,
         gemma_fused_add_rmsnorm,
         gemma_rmsnorm,
@@ -121,8 +121,8 @@ class GemmaRMSNorm(CustomOp):
         return out
 
 
-if not is_flashinfer_available():
+if not is_cuda_available():
    logger.info(
-        "FlashInfer is not available on Non-NV platforms. Fallback to other kernel libraries."
+        "sgl-kernel is not available on Non-NV platforms. Fallback to other kernel libraries."
    )
    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
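
Note: both kernel providers compute the same fused-add RMSNorm; only the backing library changes (sgl_kernel on CUDA builds, vLLM's PyTorch layers elsewhere). For reference, a plain-PyTorch sketch of the math the fused kernel performs (a sketch of the computation only, not either library's API):

import torch

def fused_add_rmsnorm_reference(x, residual, weight, eps=1e-6):
    # The fused kernel adds the residual and RMS-normalizes the sum in one pass.
    hidden = x + residual
    variance = hidden.float().pow(2).mean(-1, keepdim=True)
    normed = (hidden.float() * torch.rsqrt(variance + eps)).to(hidden.dtype) * weight
    return normed, hidden  # normalized output and the updated residual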
sglang/srt/layers/linear.py
@@ -7,7 +7,8 @@ from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter, UninitializedParameter
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -15,10 +16,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
-
-# Workaround: many QuantizationConfig still depends on this, so we have to use vLLM's LinearBase now.
-from vllm.model_executor.layers.linear import LinearBase
-
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     PackedColumnParameter,
@@ -174,6 +171,45 @@ class UnquantizedLinearMethod(LinearMethodBase):
         return F.linear(x, layer.weight, bias)
 
 
+class LinearBase(torch.nn.Module):
+    """Base linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.skip_bias_add = skip_bias_add
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        if quant_config is None:
+            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+
 class ReplicatedLinear(LinearBase):
     """Replicated linear layer.
 
@@ -293,12 +329,14 @@ class ColumnParallelLinear(LinearBase):
         prefix: str = "",
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
     ):
         super().__init__(
             input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
         )
 
         self.gather_output = gather_output
+        self.use_presharded_weights = use_presharded_weights
 
         # Divide the weight matrix along the last dimension.
         if tp_rank is None:
@@ -366,7 +404,8 @@ class ColumnParallelLinear(LinearBase):
         if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = self.tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+            if not self.use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
@@ -382,7 +421,11 @@ class ColumnParallelLinear(LinearBase):
         if len(loaded_weight.shape) == 0:
             assert loaded_weight.numel() == 1
             loaded_weight = loaded_weight.reshape(1)
-        param.load_column_parallel_weight(loaded_weight, tp_rank=self.tp_rank)
+        param.load_column_parallel_weight(
+            loaded_weight,
+            tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
+        )
 
     def forward(self, input_):
         bias = self.bias if not self.skip_bias_add else None
@@ -463,7 +506,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             prefix=prefix,
             tp_rank=tp_rank,
            tp_size=tp_size,
+            use_presharded_weights=use_presharded_weights,
         )
+        self.prefix = prefix
 
     def weight_loader(
         self,
@@ -707,6 +752,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         prefix: str = "",
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
+        load_presharded_attn: bool = False,
     ):
         self.hidden_size = hidden_size
         self.head_size = head_size
@@ -736,6 +782,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
             self.num_kv_heads * self.head_size * tp_size,  # v_proj
         ]
+        self.use_presharded_weights = load_presharded_attn
 
         super().__init__(
             input_size=input_size,
@@ -748,6 +795,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             prefix=prefix,
             tp_rank=tp_rank,
             tp_size=tp_size,
+            use_presharded_weights=self.use_presharded_weights,
         )
 
     def _get_shard_offset_mapping(self, loaded_shard_id: str):
@@ -806,9 +854,10 @@ class QKVParallelLinear(ColumnParallelLinear):
                     shard_size=shard_size, shard_offset=shard_offset
                 )
 
-            loaded_weight_shard = loaded_weight.narrow(
-                param.output_dim, shard_offset, shard_size
-            )
+            if not self.use_presharded_weights:
+                loaded_weight_shard = loaded_weight.narrow(
+                    param.output_dim, shard_offset, shard_size
+                )
             self.weight_loader_v2(param, loaded_weight_shard, shard_id)
 
     def weight_loader_v2(
@@ -846,6 +895,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_offset=shard_offset,
             shard_size=shard_size,
             tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
         )
 
     def weight_loader(
@@ -951,9 +1001,10 @@ class QKVParallelLinear(ColumnParallelLinear):
                     param, orig_qkv_offsets, shard_id
                 )
 
-                loaded_weight_shard = loaded_weight.narrow(
-                    output_dim, shard_offset, shard_size
-                )
+                if not self.use_presharded_weights:
+                    loaded_weight_shard = loaded_weight.narrow(
+                        output_dim, shard_offset, shard_size
+                    )
                 self.weight_loader(param, loaded_weight_shard, shard_id)
                 return
 
@@ -1013,7 +1064,7 @@ class QKVParallelLinear(ColumnParallelLinear):
 
            # bitsandbytes loads the weights of the specific portion
            # no need to narrow here
-            if not use_bitsandbytes_4bit:
+            if not use_bitsandbytes_4bit and not self.use_presharded_weights:
                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
            # Special case for for AQLM codebooks.
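
Note: the new use_presharded_weights / load_presharded_attn flags let the linear weight loaders skip the narrow() step when a checkpoint already stores one shard per tensor-parallel rank. A small sketch of the difference, with made-up sizes:

import torch

# Hypothetical full weight: 8 output rows, tp_size=2, so each rank owns 4 rows.
full_weight = torch.arange(8 * 3, dtype=torch.float32).reshape(8, 3)
tp_rank, tp_size, output_dim = 1, 2, 0
shard_size = full_weight.shape[output_dim] // tp_size

# Default path: slice this rank's shard out of the full checkpoint tensor.
shard = full_weight.narrow(output_dim, tp_rank * shard_size, shard_size)
assert shard.shape == (4, 3)

# With use_presharded_weights=True, the checkpoint already contains only this
# rank's 4x3 shard, so the loader copies it as-is and the narrow() is skipped.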
sglang/srt/layers/logits_processor.py
@@ -14,17 +14,18 @@
 """Logits processing."""
 
 import dataclasses
+import logging
 from typing import List, Optional, Union
 
 import torch
 import triton
 import triton.language as tl
 from torch import nn
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
-
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
@@ -32,6 +33,8 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
 )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclasses.dataclass
 class LogitsProcessorOutput:
@@ -50,8 +53,6 @@ class LogitsProcessorOutput:
     next_token_top_logprobs_idx: Optional[List] = None
 
     ## Part 3: Prefill-only. This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
-    # The normlaized logprobs of prompts. shape: [#seq]
-    normalized_prompt_logprobs: torch.Tensor = None
     # The logprobs of input tokens. shape: [#token]
     input_token_logprobs: torch.Tensor = None
     # The logprobs and ids of the top-k tokens in input positions. shape: [#seq, #token, k]
@@ -129,59 +130,70 @@ class LogitsProcessor(nn.Module):
         hidden_states,
         lm_head: VocabParallelEmbedding,
         logits_metadata: Union[LogitsMetadata, ForwardBatch],
-    ):
+    ) -> LogitsProcessorOutput:
         if isinstance(logits_metadata, ForwardBatch):
             logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata)
 
         # Get the last hidden states and last logits for the next token prediction
         if (
-            logits_metadata.forward_mode.is_decode()
+            logits_metadata.forward_mode.is_decode_or_idle()
             or logits_metadata.forward_mode.is_target_verify()
         ):
-            last_index = None
-            last_hidden = hidden_states
-        else:
+            pruned_states = hidden_states
+            sample_indices = None
+        elif (
+            logits_metadata.forward_mode.is_extend()
+            and not logits_metadata.extend_return_logprob
+        ):
+            # Prefill without input logprobs.
             last_index = torch.cumsum(logits_metadata.extend_seq_lens, dim=0) - 1
-            last_hidden = hidden_states[last_index]
+            pruned_states = hidden_states[last_index]
+            sample_indices = None
+        else:
+            # Slice the requested tokens to compute logprob
+            sample_index_pt = -1
+            sample_indices = []
+            pt, pruned_states, pruned_input_ids = 0, [], []
+            for start_len, extend_len in zip(
+                logits_metadata.extend_logprob_start_lens_cpu,
+                logits_metadata.extend_seq_lens_cpu,
+            ):
+                pruned_states.append(hidden_states[pt + start_len : pt + extend_len])
+                sample_index_pt += extend_len - start_len
+                sample_indices.append(sample_index_pt)
+                pruned_input_ids.append(input_ids[pt + start_len : pt + extend_len])
+                pt += extend_len
+
+            pruned_states = torch.cat(pruned_states)
+
+        # Compute logits for both input and sampled tokens.
+        logits = self._get_logits(pruned_states, lm_head, logits_metadata)
+        sampled_logits = (
+            logits[sample_indices] if sample_indices is not None else logits
+        )
 
-        # Compute logits
-        last_logits = self._get_logits(last_hidden, lm_head)
         if (
             not logits_metadata.extend_return_logprob
             or logits_metadata.capture_hidden_mode.need_capture()
         ):
             # Decode mode or extend mode without return_logprob.
             return LogitsProcessorOutput(
-                next_token_logits=last_logits,
+                next_token_logits=sampled_logits,
                 hidden_states=(
                     hidden_states
                     if logits_metadata.capture_hidden_mode.is_full()
                     else (
-                        last_hidden
+                        pruned_states
                        if logits_metadata.capture_hidden_mode.is_last()
                        else None
                    )
                ),
            )
        else:
-            # Slice the requested tokens to compute logprob
-            pt, pruned_states, pruned_input_ids = 0, [], []
-            for start_len, extend_len in zip(
-                logits_metadata.extend_logprob_start_lens_cpu,
-                logits_metadata.extend_seq_lens_cpu,
-            ):
-                pruned_states.append(hidden_states[pt + start_len : pt + extend_len])
-                pruned_input_ids.append(input_ids[pt + start_len : pt + extend_len])
-                pt += extend_len
-
-            # Compute the logits of all required tokens
-            pruned_states = torch.cat(pruned_states)
-            del hidden_states
-            input_token_logits = self._get_logits(pruned_states, lm_head)
-            del pruned_states
+            input_logprobs = logits
+            del hidden_states, logits
 
             # Normalize the logprob w/o temperature, top-p
-            input_logprobs = input_token_logits
             input_logprobs = self.compute_temp_top_p_normalized_logprobs(
                 input_logprobs, logits_metadata
             )
@@ -195,25 +207,18 @@ class LogitsProcessor(nn.Module):
            else:
                input_top_logprobs_val = input_top_logprobs_idx = None
 
-            # Compute the normalized logprobs for the requested tokens.
-            # Note that we pad a zero at the end for easy batching.
            input_token_logprobs = input_logprobs[
-                torch.arange(input_logprobs.shape[0], device="cuda"),
+                torch.arange(input_logprobs.shape[0], device=input_logprobs.device),
                torch.cat(
                    [
                        torch.cat(pruned_input_ids)[1:],
-                        torch.tensor([0], device="cuda"),
+                        torch.tensor([0], device=input_logprobs.device),
                    ]
                ),
            ]
-            normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
-                input_token_logprobs,
-                logits_metadata,
-            )
 
            return LogitsProcessorOutput(
-                next_token_logits=last_logits,
-                normalized_prompt_logprobs=normalized_prompt_logprobs,
+                next_token_logits=sampled_logits,
                input_token_logprobs=input_token_logprobs,
                input_top_logprobs_val=input_top_logprobs_val,
                input_top_logprobs_idx=input_top_logprobs_idx,
@@ -223,8 +228,11 @@ class LogitsProcessor(nn.Module):
         self,
         hidden_states: torch.Tensor,
         lm_head: VocabParallelEmbedding,
+        logits_metadata: LogitsMetadata,
         embedding_bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        """Get logits from hidden_states."""
+
         if hasattr(lm_head, "weight"):
             logits = torch.matmul(hidden_states, lm_head.weight.T)
         else:
@@ -237,8 +245,6 @@ class LogitsProcessor(nn.Module):
         if self.do_tensor_parallel_all_gather:
             logits = tensor_model_parallel_all_gather(logits)
 
-        # Compute the normalized logprobs for the requested tokens.
-        # Note that we pad a zero at the end for easy batching.
         logits = logits[:, : self.config.vocab_size].float()
 
         if self.final_logit_softcapping:
@@ -246,27 +252,6 @@ class LogitsProcessor(nn.Module):
 
         return logits
 
-    @staticmethod
-    def _get_normalized_prompt_logprobs(
-        input_token_logprobs: torch.Tensor,
-        logits_metadata: LogitsMetadata,
-    ):
-        logprobs_cumsum = torch.cumsum(input_token_logprobs, dim=0, dtype=torch.float32)
-        pruned_lens = torch.tensor(
-            logits_metadata.extend_logprob_pruned_lens_cpu, device="cuda"
-        )
-
-        start = torch.zeros_like(pruned_lens)
-        start[1:] = torch.cumsum(pruned_lens[:-1], dim=0)
-        end = torch.clamp(
-            start + pruned_lens - 2, min=0, max=logprobs_cumsum.shape[0] - 1
-        )
-        sum_logp = (
-            logprobs_cumsum[end] - logprobs_cumsum[start] + input_token_logprobs[start]
-        )
-        normalized_prompt_logprobs = sum_logp / (pruned_lens - 1).clamp(min=1)
-        return normalized_prompt_logprobs
-
     @staticmethod
     def get_top_logprobs(all_logprobs: torch.Tensor, logits_metadata: LogitsMetadata):
         max_k = max(logits_metadata.top_logprobs_nums)
@@ -311,7 +296,7 @@ def fused_softcap_kernel(
     n_elements,
     BLOCK_SIZE: tl.constexpr,
 ):
-    pid = tl.program_id(0)
+    pid = tl.program_id(0).to(tl.int64)
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements
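
Note: the rewritten LogitsProcessor.forward computes logits once over the pruned prefill tokens and then selects each request's last position through sample_indices. The index arithmetic can be traced by hand; a worked example with made-up request lengths:

# Two requests: extend lengths [4, 3], logprob start lengths [1, 0].
extend_seq_lens_cpu = [4, 3]
extend_logprob_start_lens_cpu = [1, 0]

sample_index_pt, sample_indices, pt = -1, [], 0
for start_len, extend_len in zip(extend_logprob_start_lens_cpu, extend_seq_lens_cpu):
    # Each request contributes (extend_len - start_len) rows to pruned_states.
    sample_index_pt += extend_len - start_len
    sample_indices.append(sample_index_pt)
    pt += extend_len

# Request 0 keeps rows 0-2, request 1 keeps rows 3-5; their last rows are 2 and 5.
assert sample_indices == [2, 5]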