sglang 0.3.5__py3-none-any.whl → 0.3.5.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- sglang/bench_serving.py +113 -3
- sglang/srt/configs/model_config.py +5 -2
- sglang/srt/constrained/__init__.py +2 -66
- sglang/srt/constrained/base_grammar_backend.py +72 -0
- sglang/srt/constrained/outlines_backend.py +165 -0
- sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang/srt/constrained/xgrammar_backend.py +114 -0
- sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- sglang/srt/layers/quantization/base_config.py +4 -6
- sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- sglang/srt/managers/io_struct.py +5 -3
- sglang/srt/managers/schedule_batch.py +14 -20
- sglang/srt/managers/scheduler.py +153 -94
- sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang/srt/metrics/collector.py +211 -0
- sglang/srt/metrics/func_timer.py +108 -0
- sglang/srt/mm_utils.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/forward_batch_info.py +7 -3
- sglang/srt/model_executor/model_runner.py +2 -1
- sglang/srt/models/gemma2_reward.py +69 -0
- sglang/srt/models/gpt2.py +31 -37
- sglang/srt/models/internlm2_reward.py +62 -0
- sglang/srt/models/llama.py +11 -6
- sglang/srt/models/llama_reward.py +5 -26
- sglang/srt/models/qwen2_vl.py +5 -7
- sglang/srt/openai_api/adapter.py +6 -2
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/sampling/sampling_params.py +0 -14
- sglang/srt/server.py +58 -16
- sglang/srt/server_args.py +42 -22
- sglang/srt/utils.py +87 -0
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_mgsm.py +2 -2
- sglang/test/test_utils.py +18 -4
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA +11 -7
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/RECORD +45 -42
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/WHEEL +1 -1
- sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang/srt/constrained/bnf_cache.py +0 -61
- sglang/srt/constrained/fsm_cache.py +0 -95
- sglang/srt/constrained/grammar.py +0 -190
- sglang/srt/constrained/jump_forward.py +0 -203
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/constrained/xgrammar_backend.py (new file):

```diff
@@ -0,0 +1,114 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Constrained decoding with xgrammar backend."""
+
+from typing import List, Tuple
+
+import torch
+from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
+
+from sglang.srt.constrained.base_grammar_backend import (
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+
+MAX_ROLLBACK_TOKENS = 10
+
+
+class XGrammarGrammar(BaseGrammarObject):
+
+    def __init__(
+        self, matcher: GrammarMatcher, vocab_size: int, ctx: CompiledGrammar
+    ) -> None:
+        self.matcher = matcher
+        self.vocab_size = vocab_size
+        self.ctx = ctx
+
+    def accept_token(self, token: int):
+        assert self.matcher.accept_token(token)
+
+    def try_jump_forward(self, tokenizer) -> Tuple[List[int], str]:
+        s = self.matcher.find_jump_forward_string()
+        if s:
+            return [], s
+        return None
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        _, data = helper
+        return data, -1
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        k = 0
+        for i, old_id in enumerate(old_output_ids):
+            if old_id == new_output_ids[i]:
+                k = i + 1
+            else:
+                break
+
+        # rollback to the last token that is the same
+        if k < len(old_output_ids):
+            self.matcher.rollback(len(old_output_ids) - k)
+
+        for i in range(k, len(new_output_ids)):
+            assert self.matcher.accept_token(new_output_ids[i])
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor):
+        # Note that this bitmask is a bitset, not bool
+        bitmask = self.matcher.get_next_token_bitmask()
+        # Mask the tokens that are not allowed
+        vocab_mask[
+            self.matcher.get_rejected_tokens_from_bitmask(bitmask, self.vocab_size)
+        ] = 1
+
+    def copy(self):
+        matcher = GrammarMatcher(
+            self.ctx,
+            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
+            mask_vocab_size=self.vocab_size,
+        )
+        return XGrammarGrammar(matcher, self.vocab_size, self.ctx)
+
+
+class XGrammarGrammarBackend(BaseGrammarBackend):
+    def __init__(
+        self,
+        tokenizer,
+        vocab_size: int,
+    ):
+        super().__init__()
+        self.grammar_cache = CachedGrammarCompiler(tokenizer_or_vocab=tokenizer)
+        self.vocab_size = vocab_size
+
+    def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
+        key_type, key_string = key
+        if key_type == "json":
+            ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(key_string)
+        elif key_type == "regex":
+            raise ValueError("regex hasn't been supported by xgrammar yet")
+        else:
+            raise ValueError(f"Invalid key_type: {key_type}")
+
+        matcher = GrammarMatcher(
+            ctx,
+            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
+            mask_vocab_size=self.vocab_size,
+        )
+        return XGrammarGrammar(matcher, self.vocab_size, ctx)
+
+    def reset(self):
+        self.grammar_cache.clear()
```
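The new backend plugs xgrammar's grammar compiler and matcher into the `BaseGrammarBackend` interface added in `base_grammar_backend.py`. Below is a minimal sketch, not part of the release, of how the pieces shown above fit together; it assumes a compatible `xgrammar` build is installed, and the tokenizer name, schema, mask dtype, and token id are illustrative placeholders.

```python
# Hedged sketch: exercise the new xgrammar backend directly.
# Assumptions: sglang 0.3.5.post1 plus a compatible xgrammar build are installed;
# the tokenizer name, JSON schema, and token id below are placeholders.
import torch
from transformers import AutoTokenizer

from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
backend = XGrammarGrammarBackend(tokenizer, vocab_size=len(tokenizer))

# Compile a grammar for a JSON schema; keys are (type, value) tuples.
grammar = backend.init_value_impl(("json", '{"type": "object"}'))

# During decoding, the scheduler would mask logits and then step the matcher.
vocab_mask = torch.zeros(len(tokenizer), dtype=torch.bool)
grammar.fill_vocab_mask(vocab_mask)  # sets grammar-rejected token ids to 1
next_token = 123  # placeholder; must be a token the grammar currently allows
grammar.accept_token(next_token)     # otherwise the assert in accept_token fires
```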
sglang/srt/layers/attention/triton_ops/decode_attention.py:

```diff
@@ -507,6 +507,12 @@ def _decode_grouped_att_m_fwd(
 
     num_warps = 4
 
+    extra_kargs = {}
+    if is_hip():
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
     _fwd_grouped_kernel_stage1[grid](
         q,
         k_buffer,
@@ -532,6 +538,7 @@ def _decode_grouped_att_m_fwd(
         num_warps=num_warps,
         num_stages=1,
         Lk=Lk,
+        **extra_kargs,
     )
 
 
```
sglang/srt/layers/attention/triton_ops/extend_attention.py:

```diff
@@ -25,6 +25,7 @@ import triton.language as tl
 from sglang.srt.layers.attention.triton_ops.prefill_attention import (
     context_attention_fwd,
 )
+from sglang.srt.utils import is_hip
 
 is_cuda_available = torch.cuda.is_available()
 if is_cuda_available:
@@ -311,6 +312,10 @@ def extend_attention_fwd(
     num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
 
+    extra_kargs = {}
+    if is_hip():
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
     _fwd_kernel[grid](
         q_extend,
         k_extend,
@@ -348,6 +353,7 @@
         Lv=Lv,
         num_warps=num_warps,
         num_stages=num_stages,
+        **extra_kargs,
     )
 
 
```
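Both attention kernels now collect AMD-specific Triton launch options behind the `is_hip()` helper (imported from `sglang.srt.utils`, as the hunk above shows) and splat them into the launch, so the CUDA call sites stay unchanged. A minimal sketch of the same pattern on a toy kernel follows, assuming a Triton install with the ROCm backend; only the option names and values come from the diff, the kernel itself is illustrative.

```python
# Hedged sketch of the launch pattern above: ROCm-only compile options are
# gathered into a dict and **-splatted into the Triton kernel launch.
import torch
import triton
import triton.language as tl

from sglang.srt.utils import is_hip  # helper used by the attention kernels above


@triton.jit
def _copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    # Toy elementwise copy, padded to a power-of-two block with a bounds mask.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)


def copy(src: torch.Tensor, dst: torch.Tensor) -> None:
    extra_kargs = {}
    if is_hip():
        # Forwarded to Triton's AMD backend compiler (see the links in the diff).
        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
    grid = (triton.cdiv(src.numel(), 1024),)
    _copy_kernel[grid](src, dst, src.numel(), BLOCK=1024, **extra_kargs)
```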
sglang/srt/layers/fused_moe/fused_moe.py:

```diff
@@ -54,6 +54,7 @@ def fused_moe_kernel(
     top_k: tl.constexpr,
     compute_type: tl.constexpr,
     use_fp8: tl.constexpr,
+    even_Ks: tl.constexpr,
 ):
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
@@ -130,16 +131,24 @@ def fused_moe_kernel(
     # of fp32 values for higher accuracy.
     # `accumulator` will be converted back to fp16 after the loop.
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
         # Load the next block of A and B, generate a mask by checking the
         # K dimension.
-        a = tl.load(
-            a_ptrs,
-            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-            other=0.0,
-        )
-        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        if even_Ks:
+            a = tl.load(
+                a_ptrs,
+                mask=token_mask[:, None],
+                other=0.0,
+            )
+            b = tl.load(b_ptrs)
+        else:
+            a = tl.load(
+                a_ptrs,
+                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+                other=0.0,
+            )
+            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
         # We accumulate along the K dimension.
         if use_fp8:
             accumulator = tl.dot(a, b, acc=accumulator)
@@ -253,6 +262,12 @@ def invoke_fused_moe_kernel(
         * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
     )
 
+    K = B.shape[2] - padding_size
+    if K % config["BLOCK_SIZE_K"] == 0:
+        even_ks = True
+    else:
+        even_ks = False
+
     fused_moe_kernel[grid](
         A,
         B,
@@ -278,6 +293,7 @@ def invoke_fused_moe_kernel(
         top_k=top_k,
         compute_type=compute_type,
         use_fp8=use_fp8,
+        even_Ks=even_ks,
         **config,
     )
 
```
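The new `even_Ks` constexpr lets the kernel drop the K-boundary mask on `tl.load` whenever the padding-corrected K dimension is a multiple of the tile size, which is exactly what the host-side check in `invoke_fused_moe_kernel` computes. A tiny sketch of that check with illustrative numbers that are not from the diff:

```python
# Hedged sketch: the divisibility check that selects the unmasked fast path.
K = 4096            # stands in for B.shape[2] - padding_size
BLOCK_SIZE_K = 128  # stands in for config["BLOCK_SIZE_K"] from the tuning config

even_ks = K % BLOCK_SIZE_K == 0
print(even_ks)  # True: every K tile is full, so the kernel can skip the boundary mask
```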
sglang/srt/layers/quantization/base_config.py:

```diff
@@ -122,16 +122,14 @@ class QuantizationConfig(ABC):
         """
         raise NotImplementedError
 
-def method_has_implemented_embedding(
-        method_class: Type[QuantizeMethodBase]) -> bool:
+
+def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
     """
     Not all quant methods have embedding implemented, so we need to check that
     it exists for our given method. We check this by making sure the function
     has been changed from the base implementation.
     """
-    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
-                                            None)
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
     class_embedding = inspect.getattr_static(method_class, "embedding", None)
 
-    return (class_embedding is not None
-            and class_embedding is not base_embedding)
+    return class_embedding is not None and class_embedding is not base_embedding
```
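The reformatted function keeps the same logic: a quantization method counts as implementing `embedding` only when the attribute exists and is not the one inherited from `QuantizeMethodBase`. A standalone sketch of that `inspect.getattr_static` check using stand-in classes (not the real sglang classes):

```python
# Hedged sketch of the check, with stand-in classes instead of the real
# QuantizeMethodBase hierarchy.
import inspect


class FakeQuantizeMethodBase:
    def embedding(self, *args, **kwargs):
        raise NotImplementedError


class FakeMethodWithEmbedding(FakeQuantizeMethodBase):
    def embedding(self, *args, **kwargs):
        return "quantized embedding lookup"


def method_has_implemented_embedding(method_class) -> bool:
    # getattr_static avoids triggering descriptors while comparing attributes.
    base = inspect.getattr_static(FakeQuantizeMethodBase, "embedding", None)
    impl = inspect.getattr_static(method_class, "embedding", None)
    return impl is not None and impl is not base


print(method_has_implemented_embedding(FakeQuantizeMethodBase))   # False
print(method_has_implemented_embedding(FakeMethodWithEmbedding))  # True
```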
|