sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/api.py +7 -1
- sglang/bench_latency.py +9 -6
- sglang/bench_serving.py +46 -22
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +4 -2
- sglang/lang/ir.py +16 -7
- sglang/srt/constrained/base_tool_cache.py +1 -1
- sglang/srt/constrained/fsm_cache.py +12 -2
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/layers/activation.py +32 -0
- sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
- sglang/srt/layers/extend_attention.py +9 -2
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/layernorm.py +65 -0
- sglang/srt/layers/logits_processor.py +7 -2
- sglang/srt/layers/pooler.py +50 -0
- sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
- sglang/srt/layers/radix_attention.py +40 -16
- sglang/srt/managers/detokenizer_manager.py +31 -9
- sglang/srt/managers/io_struct.py +63 -0
- sglang/srt/managers/policy_scheduler.py +173 -25
- sglang/srt/managers/schedule_batch.py +115 -97
- sglang/srt/managers/tokenizer_manager.py +194 -112
- sglang/srt/managers/tp_worker.py +290 -359
- sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
- sglang/srt/mem_cache/chunk_cache.py +43 -20
- sglang/srt/mem_cache/memory_pool.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +74 -40
- sglang/srt/model_executor/cuda_graph_runner.py +71 -25
- sglang/srt/model_executor/forward_batch_info.py +293 -156
- sglang/srt/model_executor/model_runner.py +77 -57
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/deepseek.py +2 -2
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +11 -6
- sglang/srt/models/grok.py +50 -396
- sglang/srt/models/internlm2.py +2 -7
- sglang/srt/models/llama2.py +4 -4
- sglang/srt/models/llama_embedding.py +88 -0
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/mixtral.py +56 -254
- sglang/srt/models/mixtral_quant.py +1 -4
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_moe.py +2 -13
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +187 -48
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/penaltylib/__init__.py +13 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
- sglang/srt/sampling_params.py +31 -8
- sglang/srt/server.py +91 -29
- sglang/srt/server_args.py +32 -19
- sglang/srt/utils.py +32 -15
- sglang/test/run_eval.py +10 -1
- sglang/test/runners.py +81 -73
- sglang/test/simple_eval_humaneval.py +2 -8
- sglang/test/simple_eval_mgsm.py +203 -0
- sglang/test/srt/sampling/penaltylib/utils.py +337 -0
- sglang/test/test_layernorm.py +60 -0
- sglang/test/test_programs.py +36 -7
- sglang/test/test_utils.py +24 -2
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/METADATA +33 -16
- sglang-0.2.13.dist-info/RECORD +112 -0
- {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
- sglang/srt/layers/linear.py +0 -884
- sglang/srt/layers/quantization/__init__.py +0 -64
- sglang/srt/layers/quantization/fp8.py +0 -677
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.11.dist-info/RECORD +0 -102
- {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
- {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/sglang/srt/layers/activation.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Fused operators for activation layers."""
+
+import torch
+import torch.nn.functional as F
+from flashinfer.activation import silu_and_mul
+from vllm.model_executor.custom_op import CustomOp
+
+
+class SiluAndMul(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
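Note: the new `SiluAndMul` op implements the standard gated activation used by LLaMA-style MLPs: split the last dimension in half, apply SiLU to the first half, and multiply by the second half. A minimal pure-PyTorch sketch of the `forward_native` path (no flashinfer required; shapes are illustrative only):

```python
import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Reference behavior of SiluAndMul.forward_native: SiLU(first half) * second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 2 * 11008)   # e.g. [num_tokens, 2 * intermediate_size]
y = silu_and_mul_ref(x)         # -> [4, 11008]
```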
--- a/sglang/srt/layers/token_attention.py
+++ b/sglang/srt/layers/decode_attention.py
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+"""
+Memory-efficient attention for decoding.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
     tl.store(out_ptrs, acc)
 
 
-def _token_att_m_fwd(
+def _decode_att_m_fwd(
     q,
     k_buffer,
     att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
     )
 
 
-def _token_softmax_reducev_fwd(
+def _decode_softmax_reducev_fwd(
     logics,
     v_buffer,
     o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
     )
 
 
-def token_attention_fwd(
+def decode_attention_fwd(
     q,
     k_buffer,
     v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
         (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
     )
 
-    _token_att_m_fwd(
+    _decode_att_m_fwd(
         q,
         k_buffer,
         att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
         sm_scale,
         logit_cap,
     )
-    _token_softmax_reducev_fwd(
+    _decode_softmax_reducev_fwd(
        att_m,
        v_buffer,
        o,
--- a/sglang/srt/layers/extend_attention.py
+++ b/sglang/srt/layers/extend_attention.py
@@ -13,11 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+"""
+Memory-efficient attention for prefill.
+It supports page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
 import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.prefill_attention import context_attention_fwd
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
@@ -270,7 +275,9 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = Lv
 
-    if CUDA_CAPABILITY[0] >= 8:
+    if CUDA_CAPABILITY[0] >= 9:
+        BLOCK_M, BLOCK_N = (128, 64)
+    elif CUDA_CAPABILITY[0] >= 8:
         BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
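Note: the new branch gives Hopper-class GPUs (compute capability 9.x) their own tile shape instead of falling through to the Ampere settings. A standalone sketch of the selection logic, assuming `Lq` is the query head dimension as in the kernel above (the `capability` argument is added here only so the snippet runs without a GPU):

```python
import torch

def pick_block_sizes(Lq: int, capability=None):
    major = (capability or torch.cuda.get_device_capability())[0]
    if major >= 9:                                    # Hopper (sm_90+)
        return 128, 64
    if major >= 8:                                    # Ampere (sm_80 / sm_86)
        return (128, 128) if Lq <= 128 else (64, 64)
    return (64, 64) if Lq <= 128 else (32, 32)        # older architectures

print(pick_block_sizes(Lq=128, capability=(9, 0)))    # -> (128, 64)
```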
--- /dev/null
+++ b/sglang/srt/layers/fused_moe/__init__.py
@@ -0,0 +1 @@
+from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
--- a/sglang/srt/layers/fused_moe.py
+++ b/sglang/srt/layers/fused_moe/fused_moe.py
@@ -1,20 +1,5 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
 # Adapted from
-# https://github.com/vllm-project/vllm/
+# https://github.com/vllm-project/vllm/tree/v0.5.4/vllm/model_executor/layers/fused_moe
 """Fused MoE kernel."""
 import functools
 import json
@@ -24,6 +9,7 @@ from typing import Any, Dict, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
+import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 
@@ -373,6 +359,31 @@ def get_default_config(
     return config
 
 
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    override_config: Optional[Dict[str, Any]] = None,
+):
+    if override_config:
+        config = override_config
+    else:
+        # First try to load optimal config from the file
+        E, _, N = w2_shape
+        configs = get_moe_configs(E, N, dtype)
+
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Else use the default config
+            config = get_default_config(M, E, N, w1_shape[2], top_k, dtype)
+    return config
+
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
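Note: `try_get_optimal_moe_config` centralizes the config lookup that `fused_experts` previously did inline. Tuned configs are keyed by token count `M`, and the entry whose key is closest to the actual `M` wins. A small illustration of that nearest-key lookup with a made-up (hypothetical) config table:

```python
# Hypothetical tuned-config table keyed by M; values are Triton tile parameters.
configs = {
    1:    {"BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 64},
    16:   {"BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 128},
    256:  {"BLOCK_SIZE_M": 64,  "BLOCK_SIZE_N": 128},
    4096: {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256},
}

M = 300
# Same idiom as in try_get_optimal_moe_config: pick the key closest to M.
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(config)   # -> the entry tuned for M=256
```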
@@ -403,6 +414,41 @@ def fused_topk(
     return topk_weights, topk_ids
 
 
+# This is used by the Deepseek-V2 model
+def grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+):
+
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = torch.softmax(gating_output, dim=-1)
+    num_token = scores.shape[0]
+    group_scores = (
+        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
 def fused_experts(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
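Note: `grouped_topk` implements DeepSeek-V2-style routing: experts are partitioned into `num_expert_group` groups, only the best `topk_group` groups (judged by their highest expert score) stay eligible, and the final top-k experts are then picked from those groups. A self-contained toy run of the same tensor operations:

```python
import torch

num_tokens, num_experts, num_expert_group, topk_group, topk = 2, 8, 4, 2, 2
scores = torch.softmax(torch.randn(num_tokens, num_experts), dim=-1)

# Best score per group, then keep only the top `topk_group` groups.
group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)

# Expand the group mask back to per-expert granularity and zero out the rest.
score_mask = (
    group_mask.unsqueeze(-1)
    .expand(num_tokens, num_expert_group, num_experts // num_expert_group)
    .reshape(num_tokens, -1)
)
masked = scores.masked_fill(~score_mask.bool(), 0.0)
topk_weights, topk_ids = torch.topk(masked, k=topk, dim=-1, sorted=False)
print(topk_ids)   # chosen experts all come from the surviving groups
```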
@@ -425,24 +471,23 @@ def fused_experts(
     assert w2.is_contiguous(), "Expert weights2 must be contiguous"
     assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
 
-    M, _ = hidden_states.shape
+    num_tokens, _ = hidden_states.shape
     E, N, _ = w1.shape
+    # We execute the fused_moe kernel in chunks to circumvent this issue:
+    # https://github.com/vllm-project/vllm/issues/5938
+    CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
+    M = min(num_tokens, CHUNK_SIZE)
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        "float8" if use_fp8 else None,
+        override_config=override_config,
+    )
 
-    if override_config:
-        config = override_config
-    else:
-        # First try to load optimal config from the file
-        configs = get_moe_configs(E, w2.shape[2], "float8" if use_fp8 else None)
-
-        if configs:
-            # If an optimal configuration map has been found, look up the
-            # optimal config
-            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-        else:
-            # Else use the default config
-            config = get_default_config(
-                M, E, N, w1.shape[2], topk_ids.shape[1], "float8" if use_fp8 else None
-            )
+    config = get_config_func(M)
 
     intermediate_cache1 = torch.empty(
         (M, topk_ids.shape[1], N),
@@ -460,56 +505,85 @@ def fused_experts(
         dtype=hidden_states.dtype,
     )
 
-    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-        topk_ids, config["BLOCK_SIZE_M"], E
-    )
     compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
 
-    invoke_fused_moe_kernel(
-        hidden_states,
-        w1,
-        intermediate_cache1,
-        a1_scale,
-        w1_scale,
-        topk_weights,
-        topk_ids,
-        sorted_token_ids,
-        expert_ids,
-        num_tokens_post_padded,
-        False,
-        topk_ids.shape[1],
-        config,
-        compute_type=compute_type,
-        use_fp8=use_fp8,
-    )
+    if inplace:
+        out_hidden_states = hidden_states
+    else:
+        out_hidden_states = torch.empty_like(hidden_states)
 
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
+    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+        begin_chunk_idx, end_chunk_idx = (
+            chunk * CHUNK_SIZE,
+            min((chunk + 1) * CHUNK_SIZE, num_tokens),
+        )
+        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
+        tokens_in_chunk, _ = curr_hidden_states.shape
+
+        if tokens_in_chunk == 0:
+            break
+
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
+            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
+            intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
+            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
+            config = get_config_func(tokens_in_chunk)
+
+        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
+        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            curr_topk_ids, config["BLOCK_SIZE_M"], E
+        )
 
-    invoke_fused_moe_kernel(
-        intermediate_cache2,
-        w2,
-        intermediate_cache3,
-        a2_scale,
-        w2_scale,
-        topk_weights,
-        topk_ids,
-        sorted_token_ids,
-        expert_ids,
-        num_tokens_post_padded,
-        True,
-        1,
-        config,
-        compute_type=compute_type,
-        use_fp8=use_fp8,
-    )
+        invoke_fused_moe_kernel(
+            curr_hidden_states,
+            w1,
+            intermediate_cache1,
+            a1_scale,
+            w1_scale,
+            curr_topk_weights,
+            curr_topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            False,
+            topk_ids.shape[1],
+            config,
+            compute_type=compute_type,
+            use_fp8=use_fp8,
+        )
 
-    if inplace:
-        return torch.sum(
+        ops.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
+
+        invoke_fused_moe_kernel(
+            intermediate_cache2,
+            w2,
+            intermediate_cache3,
+            a2_scale,
+            w2_scale,
+            curr_topk_weights,
+            curr_topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            True,
+            1,
+            config,
+            compute_type=compute_type,
+            use_fp8=use_fp8,
+        )
+
+        torch.sum(
             intermediate_cache3.view(*intermediate_cache3.shape),
             dim=1,
-            out=hidden_states,
+            out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
         )
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+    return out_hidden_states
 
 
 def fused_moe(
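Note: the rewritten `fused_experts` now runs the kernels over the token dimension in chunks of at most `VLLM_FUSED_MOE_CHUNK_SIZE` tokens (to limit intermediate-cache memory, per the linked vllm-project/vllm#5938), writing each chunk's result into `out_hidden_states`. The chunk bookkeeping reduces to this pattern (a sketch with small illustrative values; the real chunk size comes from `vllm.envs`):

```python
num_tokens, CHUNK_SIZE = 10, 4   # illustrative values only

for chunk in range((num_tokens // CHUNK_SIZE) + 1):
    begin, end = chunk * CHUNK_SIZE, min((chunk + 1) * CHUNK_SIZE, num_tokens)
    tokens_in_chunk = end - begin
    if tokens_in_chunk == 0:     # happens when num_tokens divides evenly
        break
    print(f"chunk {chunk}: tokens [{begin}, {end})")
# chunk 0: tokens [0, 4)
# chunk 1: tokens [4, 8)
# chunk 2: tokens [8, 10)
```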
@@ -521,6 +595,9 @@ def fused_moe(
     renormalize: bool,
     inplace: bool = False,
     override_config: Optional[Dict[str, Any]] = None,
+    use_grouped_topk: bool = False,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
     use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
@@ -543,6 +620,10 @@ def fused_moe(
         Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
+    - num_expert_group: Optional[int]: additional parameter for grouped_topk
+    - topk_group: Optional[int]: additional parameter for grouped_topk
+    - use_grouped_topk: If True, use grouped_topk instead of fused_topk
+        note: Deepseekv2 model uses grouped_topk
     - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
@@ -556,12 +637,18 @@ def fused_moe(
     # Check constraints.
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
 
-    if ...:
-        topk_weights, topk_ids = ...(
-            hidden_states, gating_output, topk, renormalize
+    if use_grouped_topk:
+        assert num_expert_group is not None and topk_group is not None
+        topk_weights, topk_ids = grouped_topk(
+            hidden_states,
+            gating_output,
+            topk,
+            renormalize,
+            num_expert_group,
+            topk_group,
         )
     else:
-        topk_weights, topk_ids = ...(
+        topk_weights, topk_ids = fused_topk(
             hidden_states, gating_output, topk, renormalize
         )
 
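Note: callers opt into the DeepSeek-V2 routing path through the new keyword arguments. A hedged usage sketch, assuming the leading positional parameters of `fused_moe` are `hidden_states, w1, w2, gating_output, topk` as in the vLLM kernel this file is adapted from (the call itself needs a CUDA device, Triton, and real expert weights):

```python
out = fused_moe(
    hidden_states,            # [num_tokens, hidden_size]
    w1,                       # [num_experts, 2 * intermediate_size, hidden_size]
    w2,                       # [num_experts, hidden_size, intermediate_size]
    gating_output,            # [num_tokens, num_experts]
    topk=6,
    renormalize=True,
    use_grouped_topk=True,    # DeepSeek-V2 style grouped routing
    num_expert_group=8,       # both must be provided when use_grouped_topk=True
    topk_group=3,
)
```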
@@ -579,33 +666,3 @@ def fused_moe(
         a1_scale=a1_scale,
         a2_scale=a2_scale,
     )
-
-
-def fused_topk_v0_4_3(
-    hidden_states: torch.Tensor,
-    gating_output: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-):
-    import vllm._moe_C as moe_kernels
-
-    M, _ = hidden_states.shape
-
-    topk_weights = torch.empty(
-        M, topk, dtype=torch.float32, device=hidden_states.device
-    )
-    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
-    token_expert_indicies = torch.empty(
-        M, topk, dtype=torch.int32, device=hidden_states.device
-    )
-    moe_kernels.topk_softmax(
-        topk_weights,
-        topk_ids,
-        token_expert_indicies,
-        gating_output.float(),  # TODO(woosuk): Optimize this.
-    )
-    del token_expert_indicies  # Not used. Will be used in the future.
-    if renormalize:
-        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-
-    return topk_weights, topk_ids