sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +92 -0
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +76 -15
  11. sglang/srt/constrained/__init__.py +18 -0
  12. sglang/srt/constrained/bnf_cache.py +61 -0
  13. sglang/srt/constrained/fsm_cache.py +10 -3
  14. sglang/srt/constrained/grammar.py +190 -0
  15. sglang/srt/hf_transformers_utils.py +20 -5
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
  18. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  19. sglang/srt/layers/fused_moe/fused_moe.py +4 -3
  20. sglang/srt/layers/fused_moe/layer.py +28 -0
  21. sglang/srt/layers/logits_processor.py +5 -5
  22. sglang/srt/layers/quantization/base_config.py +16 -1
  23. sglang/srt/layers/rotary_embedding.py +15 -48
  24. sglang/srt/layers/sampler.py +51 -39
  25. sglang/srt/layers/vocab_parallel_embedding.py +486 -0
  26. sglang/srt/managers/data_parallel_controller.py +8 -7
  27. sglang/srt/managers/detokenizer_manager.py +11 -9
  28. sglang/srt/managers/image_processor.py +4 -3
  29. sglang/srt/managers/io_struct.py +80 -78
  30. sglang/srt/managers/schedule_batch.py +46 -52
  31. sglang/srt/managers/schedule_policy.py +24 -13
  32. sglang/srt/managers/scheduler.py +145 -82
  33. sglang/srt/managers/tokenizer_manager.py +236 -334
  34. sglang/srt/managers/tp_worker.py +5 -5
  35. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  36. sglang/srt/mem_cache/flush_cache.py +1 -1
  37. sglang/srt/mem_cache/memory_pool.py +10 -3
  38. sglang/srt/model_executor/cuda_graph_runner.py +34 -23
  39. sglang/srt/model_executor/forward_batch_info.py +6 -9
  40. sglang/srt/model_executor/model_runner.py +10 -19
  41. sglang/srt/models/baichuan.py +4 -4
  42. sglang/srt/models/chatglm.py +4 -4
  43. sglang/srt/models/commandr.py +1 -1
  44. sglang/srt/models/dbrx.py +5 -5
  45. sglang/srt/models/deepseek.py +4 -4
  46. sglang/srt/models/deepseek_v2.py +4 -4
  47. sglang/srt/models/exaone.py +4 -4
  48. sglang/srt/models/gemma.py +1 -1
  49. sglang/srt/models/gemma2.py +1 -1
  50. sglang/srt/models/gpt2.py +287 -0
  51. sglang/srt/models/gpt_bigcode.py +1 -1
  52. sglang/srt/models/grok.py +4 -4
  53. sglang/srt/models/internlm2.py +4 -4
  54. sglang/srt/models/llama.py +15 -7
  55. sglang/srt/models/llama_embedding.py +2 -10
  56. sglang/srt/models/llama_reward.py +5 -0
  57. sglang/srt/models/minicpm.py +4 -4
  58. sglang/srt/models/minicpm3.py +4 -4
  59. sglang/srt/models/mixtral.py +7 -5
  60. sglang/srt/models/mixtral_quant.py +4 -4
  61. sglang/srt/models/mllama.py +5 -5
  62. sglang/srt/models/olmo.py +4 -4
  63. sglang/srt/models/olmoe.py +4 -4
  64. sglang/srt/models/qwen.py +4 -4
  65. sglang/srt/models/qwen2.py +4 -4
  66. sglang/srt/models/qwen2_moe.py +4 -4
  67. sglang/srt/models/qwen2_vl.py +4 -8
  68. sglang/srt/models/stablelm.py +4 -4
  69. sglang/srt/models/torch_native_llama.py +4 -4
  70. sglang/srt/models/xverse.py +4 -4
  71. sglang/srt/models/xverse_moe.py +4 -4
  72. sglang/srt/openai_api/adapter.py +52 -66
  73. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  74. sglang/srt/sampling/sampling_batch_info.py +7 -13
  75. sglang/srt/sampling/sampling_params.py +5 -7
  76. sglang/srt/server.py +41 -33
  77. sglang/srt/server_args.py +34 -5
  78. sglang/srt/utils.py +40 -56
  79. sglang/test/run_eval.py +2 -0
  80. sglang/test/runners.py +2 -1
  81. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  82. sglang/test/test_utils.py +151 -6
  83. sglang/utils.py +62 -1
  84. sglang/version.py +1 -1
  85. sglang-0.3.5.dist-info/METADATA +344 -0
  86. sglang-0.3.5.dist-info/RECORD +152 -0
  87. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
  88. sglang-0.3.4.post1.dist-info/METADATA +0 -900
  89. sglang-0.3.4.post1.dist-info/RECORD +0 -148
  90. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/constrained/grammar.py
@@ -0,0 +1,190 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """Cache for the compressed finite state machine."""
+ import logging
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+
+ from sglang.srt.constrained import GrammarMatcher, RegexGuide
+ from sglang.srt.constrained.bnf_cache import BNFCache
+ from sglang.srt.constrained.fsm_cache import FSMCache
+ from sglang.srt.constrained.jump_forward import JumpForwardCache, JumpForwardMap
+
+ # from sglang.srt.managers.schedule_batch import Req
+
+ logger = logging.getLogger(__name__)
+
+ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+
+ class XGrammarJump:
+     pass
+
+
+ class JumpHelper:
+     data: Union[List, str]
+     state: int
+     suffix_ids: List[int]
+
+     def __init__(
+         self, data: Union[List, str] = "", state: int = -1, suffix_ids=[]
+     ) -> None:
+         self.data = data
+         self.state = state
+         self.suffix_ids = suffix_ids
+
+     def can_jump(self):
+         return len(self.data) > 0
+
+
+ class Grammar:
+     grammar: Union[GrammarMatcher, Tuple[RegexGuide, int]]
+     jump_map: Union[XGrammarJump, JumpForwardMap, None]
+
+     def __init__(
+         self,
+         grammar: Union[GrammarMatcher, Tuple[RegexGuide, int]],
+         jump_map: Union[XGrammarJump, JumpForwardMap, None],
+     ) -> None:
+         self.grammar = grammar
+         self.jump_map = jump_map
+
+     def accept_token(self, token: int):
+         if isinstance(self.grammar, GrammarMatcher):
+             assert self.grammar.accept_token(token)
+         else:
+             guide, state = self.grammar
+             self.grammar = guide, guide.get_next_state(state, token)
+
+     def try_jump(self, tokenizer) -> JumpHelper:
+         if isinstance(self.jump_map, XGrammarJump):
+             assert isinstance(self.grammar, GrammarMatcher)
+             return JumpHelper(self.grammar.find_jump_forward_string())
+         elif isinstance(self.jump_map, JumpForwardMap):
+             assert isinstance(self.grammar, Tuple)
+
+             _, state = self.grammar
+             jump_forward_bytes = self.jump_map.jump_forward_byte(state)
+             if jump_forward_bytes is None or len(jump_forward_bytes) == 0:
+                 return JumpHelper()  # can't jump
+
+             # preprocess the jump forward string
+             suffix_bytes = []
+             continuation_range = range(0x80, 0xC0)
+             cur_state = state
+             while (
+                 len(jump_forward_bytes)
+                 and jump_forward_bytes[0][0] in continuation_range
+             ):
+                 # continuation bytes
+                 byte_edge = jump_forward_bytes.pop(0)
+                 suffix_bytes.append(byte_edge[0])
+                 cur_state = byte_edge[1]
+
+             suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
+             suffix_ids = tokenizer.convert_tokens_to_ids(suffix_tokens)
+             return JumpHelper(suffix_ids, cur_state, suffix_bytes)
+         else:
+             return JumpHelper()  # can't jump
+
+     def jump_forward_str_state(self, helper: JumpHelper) -> Tuple[str, int]:
+         if isinstance(helper.data, str):
+             return helper.data, -1
+         else:
+             assert isinstance(self.jump_map, JumpForwardMap)
+             return self.jump_map.jump_forward_symbol(helper.state)
+
+     def jump_and_retokenize(
+         self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+     ):
+         if isinstance(self.grammar, GrammarMatcher):
+             k = 0
+             for i, old_id in enumerate(old_output_ids):
+                 if old_id == new_output_ids[i]:
+                     k = i + 1
+                 else:
+                     break
+
+             # rollback to the last token that is the same
+             if k < len(old_output_ids):
+                 self.grammar.rollback(len(old_output_ids) - k)
+
+             for i in range(k, len(new_output_ids)):
+                 assert self.grammar.accept_token(new_output_ids[i])
+         else:
+             self.grammar = self.grammar[0], next_state
+
+     def fill_vocab_mask(self, vocab_mask: torch.Tensor, vocab_size: int):
+         if isinstance(self.grammar, GrammarMatcher):
+             # Note that this bitmask is a bitset, not bool
+             bitmask = self.grammar.find_next_token_bitmask()
+             # Mask the tokens that are not allowed
+             vocab_mask[
+                 self.grammar.get_rejected_tokens_from_bitmask(bitmask, vocab_size)
+             ] = 1
+         else:
+             guide, state = self.grammar
+             vocab_mask.fill_(1)
+             vocab_mask[guide.get_next_instruction(state).tokens] = 0
+
+
+ class GrammarCache:
+     grammar_cache: Union[BNFCache, FSMCache]
+     jump_cache: Union[XGrammarJump, JumpForwardCache, None]
+
+     def __init__(
+         self,
+         tokenizer_path,
+         tokenizer_args_dict,
+         skip_tokenizer_init=False,
+         whitespace_patterns=None,
+         backend=None,
+         allow_jump=False,
+     ):
+         if backend == "xgrammar":
+             self.grammar_cache = BNFCache(
+                 tokenizer_path=tokenizer_path,
+                 tokenizer_args_dict=tokenizer_args_dict,
+                 skip_tokenizer_init=skip_tokenizer_init,
+                 whitespace_patterns=whitespace_patterns,
+             )
+             self.jump_cache = XGrammarJump() if allow_jump else None
+         else:
+             assert backend == "outlines"
+             self.grammar_cache = FSMCache(
+                 tokenizer_path=tokenizer_path,
+                 tokenizer_args_dict=tokenizer_args_dict,
+                 skip_tokenizer_init=skip_tokenizer_init,
+                 constrained_json_whitespace_pattern=whitespace_patterns,
+                 enable=True,
+             )
+             self.jump_cache = JumpForwardCache() if allow_jump else None
+
+     def query(self, key: Tuple[str, str], vocab_size: int) -> Grammar:
+         if isinstance(self.grammar_cache, BNFCache):
+             assert not isinstance(self.jump_cache, JumpForwardCache)
+             return Grammar(self.grammar_cache.query(key, vocab_size), self.jump_cache)
+         else:
+             jump_map = None
+             guide, regex = self.grammar_cache.query(key)
+             if isinstance(self.jump_cache, JumpForwardCache):
+                 jump_map = self.jump_cache.query(regex)
+             return Grammar((guide, 0), jump_map)
+
+     def reset(self):
+         if isinstance(self.grammar_cache, FSMCache):
+             self.grammar_cache.reset()
+         if isinstance(self.jump_cache, JumpForwardCache):
+             self.jump_cache.reset()
sglang/srt/hf_transformers_utils.py
@@ -81,26 +81,27 @@ def get_config(
  CONTEXT_LENGTH_KEYS = [
      "max_sequence_length",
      "seq_length",
-     "max_position_embeddings",
      "max_seq_len",
      "model_max_length",
+     "max_position_embeddings",
  ]


  def get_context_length(config):
      """Get the context length of a model from a huggingface model configs."""
-     rope_scaling = getattr(config, "rope_scaling", None)
+     text_config = config
+     rope_scaling = getattr(text_config, "rope_scaling", None)
      if rope_scaling:
-         rope_scaling_factor = config.rope_scaling.get("factor", 1)
+         rope_scaling_factor = rope_scaling.get("factor", 1)
          if "original_max_position_embeddings" in rope_scaling:
              rope_scaling_factor = 1
-         if config.rope_scaling.get("rope_type", None) == "llama3":
+         if rope_scaling.get("rope_type", None) == "llama3":
              rope_scaling_factor = 1
      else:
          rope_scaling_factor = 1

      for key in CONTEXT_LENGTH_KEYS:
-         val = getattr(config, key, None)
+         val = getattr(text_config, key, None)
          if val is not None:
              return int(rope_scaling_factor * val)
      return 2048
@@ -163,6 +164,8 @@ def get_tokenizer(
              "Using a slow tokenizer. This might cause a significant "
              "slowdown. Consider using a fast tokenizer instead."
          )
+
+     attach_additional_stop_token_ids(tokenizer)
      return tokenizer


@@ -181,4 +184,16 @@ def get_processor(
          tokenizer_revision=tokenizer_revision,
          **kwargs,
      )
+
+     attach_additional_stop_token_ids(processor.tokenizer)
      return processor
+
+
+ def attach_additional_stop_token_ids(tokenizer):
+     # Special handling for stop token <|eom_id|> generated by llama 3 tool use.
+     if "<|eom_id|>" in tokenizer.get_added_vocab():
+         tokenizer.additional_stop_token_ids = set(
+             [tokenizer.get_added_vocab()["<|eom_id|>"]]
+         )
+     else:
+         tokenizer.additional_stop_token_ids = None
sglang/srt/layers/attention/flashinfer_backend.py
@@ -337,7 +337,7 @@ class FlashInferIndicesUpdaterDecode:
      def update(
          self, req_pool_indices, seq_lens, seq_lens_sum, decode_wrappers, encoder_lens
      ):
-         # Keep the signature for type checking, will be initialized during runtime
+         # Keep the signature for type checking. It will be assigned during runtime.
          raise NotImplementedError()

      def update_single_wrapper(
@@ -432,8 +432,8 @@
          kv_start_idx,
      ):
          bs = len(req_pool_indices)
+         kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
          kv_indptr = kv_indptr[: bs + 1]
-         kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
          kv_indices = torch.empty(
              paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
          )
@@ -497,7 +497,7 @@ class FlashInferIndicesUpdaterPrefill:
          self.update = self.update_single_wrapper

      def update(self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens):
-         # Keep the signature for type checking, will be initialized during runtime
+         # Keep the signature for type checking. It will be assigned during runtime.
          raise NotImplementedError()

      def update_single_wrapper(
@@ -589,8 +589,8 @@
          use_ragged,
      ):
          bs = len(req_pool_indices)
+         kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
          kv_indptr = kv_indptr[: bs + 1]
-         kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
          kv_indices = torch.empty(kv_indptr[-1], dtype=torch.int32, device="cuda")
          create_flashinfer_kv_indices_triton[(bs,)](
              self.req_to_token,
@@ -602,8 +602,8 @@
              self.max_context_len,
          )

+         qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
          qo_indptr = qo_indptr[: bs + 1]
-         qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)

          # extend part
          if use_ragged:
sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -24,6 +24,8 @@ It supports page size = 1.
  import triton
  import triton.language as tl

+ from sglang.srt.utils import is_hip
+

  @triton.jit
  def tanh(x):
@@ -296,12 +298,18 @@ def _fwd_grouped_kernel_stage1(
      Lk: tl.constexpr,
  ):
      cur_batch = tl.program_id(0)
-     cur_kv_head = tl.program_id(1)
+     cur_head_id = tl.program_id(1)
+     cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
      start_n = tl.program_id(2)

      reduce_dtype = Att_Out.dtype.element_ty
-     cur_head = cur_kv_head * kv_group_num + tl.arange(0, BLOCK_H)
-     mask_h = cur_head < (cur_kv_head + 1) * kv_group_num
+
+     if BLOCK_H < kv_group_num:
+         VALID_BLOCK_H: tl.constexpr = BLOCK_H
+     else:
+         VALID_BLOCK_H: tl.constexpr = kv_group_num
+     cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+     mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
      mask_h = mask_h & (cur_head < q_head_num)

      offs_d = tl.arange(0, BLOCK_DMODEL)
@@ -400,10 +408,15 @@ def _fwd_grouped_kernel_stage2(
      Lv: tl.constexpr,
  ):
      cur_batch = tl.program_id(0)
-     cur_kv_head = tl.program_id(1)
+     cur_head_id = tl.program_id(1)
+     cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)

-     cur_head = cur_kv_head * kv_group_num + tl.arange(0, BLOCK_H)
-     mask_h = cur_head < (cur_kv_head + 1) * kv_group_num
+     if BLOCK_H < kv_group_num:
+         VALID_BLOCK_H: tl.constexpr = BLOCK_H
+     else:
+         VALID_BLOCK_H: tl.constexpr = kv_group_num
+     cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+     mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
      mask_h = mask_h & (cur_head < q_head_num)

      cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
@@ -485,7 +498,7 @@ def _decode_grouped_att_m_fwd(
      batch, head_num = B_req_idx.shape[0], q.shape[1]
      kv_group_num = q.shape[1] // k_buffer.shape[1]

-     BLOCK_H = max(16, triton.next_power_of_2(kv_group_num))
+     BLOCK_H = max(16, min(64, triton.next_power_of_2(kv_group_num)))
      grid = (
          batch,
          triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
@@ -534,7 +547,7 @@ def _decode_grouped_softmax_reducev_fwd(
      BLOCK = 128
      batch, head_num = b_seq_len.shape[0], logits.shape[0]
      kv_group_num = logits.shape[0] // v_buffer.shape[1]
-     BLOCK_H = max(16, triton.next_power_of_2(kv_group_num))
+     BLOCK_H = max(16, min(64, triton.next_power_of_2(kv_group_num)))
      grid = (batch, triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), 1)

      num_warps = 8
@@ -542,6 +555,12 @@
      Lv = v_buffer.shape[-1]
      BLOCK_DMODEL = triton.next_power_of_2(Lv)

+     extra_kargs = {}
+     if is_hip():
+         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
      _fwd_grouped_kernel_stage2[grid](
          logits,
          v_buffer,
@@ -564,6 +583,81 @@
          Lv=Lv,
          num_warps=num_warps,
          num_stages=1,
+         **extra_kargs,
+     )
+
+
+ def decode_attention_fwd_normal(
+     q,
+     k_buffer,
+     v_buffer,
+     o,
+     req_to_token,
+     b_req_idx,
+     b_start_loc,
+     b_seq_len,
+     attn_logits,
+     max_len_in_batch,
+     sm_scale,
+     logit_cap=0.0,
+ ):
+     _decode_att_m_fwd(
+         q,
+         k_buffer,
+         attn_logits,
+         req_to_token,
+         b_req_idx,
+         b_start_loc,
+         b_seq_len,
+         max_len_in_batch,
+         sm_scale,
+         logit_cap,
+     )
+     _decode_softmax_reducev_fwd(
+         attn_logits,
+         v_buffer,
+         o,
+         req_to_token,
+         b_req_idx,
+         b_start_loc,
+         b_seq_len,
+     )
+
+
+ def decode_attention_fwd_grouped(
+     q,
+     k_buffer,
+     v_buffer,
+     o,
+     req_to_token,
+     b_req_idx,
+     b_start_loc,
+     b_seq_len,
+     attn_logits,
+     max_len_in_batch,
+     sm_scale,
+     logit_cap=0.0,
+ ):
+     _decode_grouped_att_m_fwd(
+         q,
+         k_buffer,
+         attn_logits,
+         req_to_token,
+         b_req_idx,
+         b_start_loc,
+         b_seq_len,
+         max_len_in_batch,
+         sm_scale,
+         logit_cap,
+     )
+     _decode_grouped_softmax_reducev_fwd(
+         attn_logits,
+         v_buffer,
+         o,
+         req_to_token,
+         b_req_idx,
+         b_start_loc,
+         b_seq_len,
      )


@@ -585,47 +679,33 @@ def decode_attention_fwd(

      if kv_group_num == 1:
          # MHA
-         _decode_att_m_fwd(
+         decode_attention_fwd_normal(
              q,
              k_buffer,
-             attn_logits,
+             v_buffer,
+             o,
              req_to_token,
              b_req_idx,
              b_start_loc,
              b_seq_len,
+             attn_logits,
              max_len_in_batch,
              sm_scale,
              logit_cap,
          )
-         _decode_softmax_reducev_fwd(
-             attn_logits,
-             v_buffer,
-             o,
-             req_to_token,
-             b_req_idx,
-             b_start_loc,
-             b_seq_len,
-         )
      else:
          # GQA/MQA/MLA
-         _decode_grouped_att_m_fwd(
+         decode_attention_fwd_grouped(
              q,
              k_buffer,
-             attn_logits,
+             v_buffer,
+             o,
              req_to_token,
              b_req_idx,
              b_start_loc,
              b_seq_len,
+             attn_logits,
              max_len_in_batch,
              sm_scale,
              logit_cap,
          )
-         _decode_grouped_softmax_reducev_fwd(
-             attn_logits,
-             v_buffer,
-             o,
-             req_to_token,
-             b_req_idx,
-             b_start_loc,
-             b_seq_len,
-         )
sglang/srt/layers/attention/triton_ops/prefill_attention.py
@@ -168,7 +168,7 @@ def _fwd_kernel(
  def context_attention_fwd(
      q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
  ):
-     if is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+     if is_cuda_available and CUDA_CAPABILITY[0] > 8:
          BLOCK = 128
      else:
          BLOCK = 64
sglang/srt/layers/fused_moe/fused_moe.py
@@ -14,6 +14,7 @@ from vllm import _custom_ops as ops
  from vllm.logger import init_logger

  logger = init_logger(__name__)
+ padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0


  @triton.jit
@@ -263,7 +264,7 @@ def invoke_fused_moe_kernel(
          expert_ids,
          num_tokens_post_padded,
          B.shape[1],
-         B.shape[2],
+         B.shape[2] - padding_size,
          sorted_token_ids.shape[0],
          topk_ids.numel(),
          A.stride(0),
@@ -464,7 +465,7 @@ def fused_experts(
      a2_scale: Optional[torch.Tensor] = None,
  ):
      # Check constraints.
-     assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
+     assert hidden_states.shape[1] == w1.shape[2] - padding_size, "Hidden size mismatch"
      assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
      assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
      assert w1.is_contiguous(), "Expert weights1 must be contiguous"
@@ -481,7 +482,7 @@
      get_config_func = functools.partial(
          try_get_optimal_moe_config,
          w1.shape,
-         w2.shape,
+         (w2.shape[0], w2.shape[1], w2.shape[2] - padding_size),
          topk_ids.shape[1],
          "float8" if use_fp8 else None,
          override_config=override_config,
sglang/srt/layers/fused_moe/layer.py
@@ -1,9 +1,11 @@
  # Adapted from
  # https://github.com/vllm-project/vllm/tree/v0.5.4/vllm/model_executor/layers/fused_moe
+ import os
  from abc import abstractmethod
  from typing import List, Optional, Tuple

  import torch
+ import torch.nn.functional as F
  from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
@@ -18,6 +20,7 @@ from vllm.model_executor.layers.quantization.base_config import (
  from vllm.model_executor.layers.quantization.fp8 import Fp8Config
  from vllm.model_executor.utils import set_weight_attrs

+ from sglang.srt.layers.fused_moe.fused_moe import padding_size
  from sglang.srt.utils import is_hip

  logger = init_logger(__name__)
@@ -506,6 +509,19 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                  )
              layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
              layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+
+             # If ROCm, apply weight padding (min. Mem channel contention) only if set
+             if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))):
+                 layer.w13_weight = torch.nn.Parameter(
+                     F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                     requires_grad=False,
+                 )
+                 torch.cuda.empty_cache()
+                 layer.w2_weight = torch.nn.Parameter(
+                     F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                     requires_grad=False,
+                 )
+                 torch.cuda.empty_cache()
              return

          # If checkpoint is fp8, we need to handle that the
@@ -572,6 +588,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                  start += shard_size

              layer.w13_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+             # If ROCm, apply weight padding (min. Mem channel contention) only if set
+             if is_hip() and bool(int(os.getenv("MOE_PADDING", "0"))):
+                 layer.w13_weight = torch.nn.Parameter(
+                     F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                     requires_grad=False,
+                 )
+                 torch.cuda.empty_cache()
+                 layer.w2_weight = torch.nn.Parameter(
+                     F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                     requires_grad=False,
+                 )
+                 torch.cuda.empty_cache()
              return

      def apply(
sglang/srt/layers/logits_processor.py
@@ -33,17 +33,17 @@ class LogitsProcessorOutput:
      # The logits of the next tokens. shape: [#seq, vocab_size]
      next_token_logits: torch.Tensor
      # The logprobs of the next tokens. shape: [#seq, vocab_size]
-     next_token_logprobs: torch.Tensor
+     next_token_logprobs: torch.Tensor = None

      # The normlaized logprobs of prompts. shape: [#seq]
-     normalized_prompt_logprobs: torch.Tensor
+     normalized_prompt_logprobs: torch.Tensor = None
      # The logprobs of input tokens. shape: [#token, vocab_size]
-     input_token_logprobs: torch.Tensor
+     input_token_logprobs: torch.Tensor = None

      # The logprob and id of the top-k tokens in input positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
-     input_top_logprobs: List
+     input_top_logprobs: List = None
      # The logprob and id of the top-k tokens in output positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
-     output_top_logprobs: List
+     output_top_logprobs: List = None


  @dataclasses.dataclass
sglang/srt/layers/quantization/base_config.py
@@ -1,7 +1,8 @@
  # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/base_config.py

+ import inspect
  from abc import ABC, abstractmethod
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Type

  import torch
  from torch import nn
@@ -120,3 +121,17 @@ class QuantizationConfig(ABC):
          For now, this is only used by AWQ.
          """
          raise NotImplementedError
+
+
+ def method_has_implemented_embedding(
+         method_class: Type[QuantizeMethodBase]) -> bool:
+     """
+     Not all quant methods have embedding implemented, so we need to check that
+     it exists for our given method. We check this by making sure the function
+     has been changed from the base implementation.
+     """
+     base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
+                                             None)
+     class_embedding = inspect.getattr_static(method_class, "embedding", None)
+
+     return (class_embedding is not None
+             and class_embedding is not base_embedding)