sglang 0.3.1.post2__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sglang/bench_latency.py +12 -11
  2. sglang/bench_server_latency.py +0 -6
  3. sglang/srt/hf_transformers_utils.py +1 -0
  4. sglang/srt/layers/activation.py +3 -2
  5. sglang/srt/layers/attention_backend.py +6 -12
  6. sglang/srt/layers/fused_moe/patch.py +117 -0
  7. sglang/srt/layers/linear.py +1133 -0
  8. sglang/srt/layers/quantization/__init__.py +76 -0
  9. sglang/srt/layers/quantization/base_config.py +122 -0
  10. sglang/srt/managers/schedule_batch.py +3 -5
  11. sglang/srt/managers/tokenizer_manager.py +1 -0
  12. sglang/srt/managers/tp_worker.py +1 -1
  13. sglang/srt/mem_cache/radix_cache.py +5 -5
  14. sglang/srt/model_executor/cuda_graph_runner.py +10 -6
  15. sglang/srt/model_executor/forward_batch_info.py +2 -4
  16. sglang/srt/model_executor/model_runner.py +0 -3
  17. sglang/srt/models/baichuan.py +1 -1
  18. sglang/srt/models/chatglm.py +6 -6
  19. sglang/srt/models/commandr.py +7 -7
  20. sglang/srt/models/dbrx.py +7 -7
  21. sglang/srt/models/deepseek.py +7 -7
  22. sglang/srt/models/deepseek_v2.py +7 -7
  23. sglang/srt/models/exaone.py +6 -6
  24. sglang/srt/models/gemma.py +6 -6
  25. sglang/srt/models/gemma2.py +6 -6
  26. sglang/srt/models/gpt_bigcode.py +6 -6
  27. sglang/srt/models/grok.py +6 -6
  28. sglang/srt/models/internlm2.py +6 -6
  29. sglang/srt/models/llama.py +14 -6
  30. sglang/srt/models/llama_classification.py +1 -1
  31. sglang/srt/models/llava.py +1 -1
  32. sglang/srt/models/llavavid.py +1 -1
  33. sglang/srt/models/minicpm.py +6 -6
  34. sglang/srt/models/minicpm3.py +1 -1
  35. sglang/srt/models/mixtral.py +6 -6
  36. sglang/srt/models/mixtral_quant.py +6 -6
  37. sglang/srt/models/olmoe.py +1 -1
  38. sglang/srt/models/qwen.py +6 -6
  39. sglang/srt/models/qwen2.py +6 -6
  40. sglang/srt/models/qwen2_moe.py +7 -7
  41. sglang/srt/models/stablelm.py +6 -6
  42. sglang/srt/models/xverse.py +1 -1
  43. sglang/srt/models/xverse_moe.py +1 -1
  44. sglang/srt/models/yivl.py +1 -1
  45. sglang/srt/openai_api/adapter.py +7 -0
  46. sglang/srt/utils.py +21 -1
  47. sglang/test/runners.py +7 -9
  48. sglang/test/test_utils.py +39 -2
  49. sglang/version.py +1 -1
  50. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/METADATA +8 -6
  51. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/RECORD +54 -50
  52. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/LICENSE +0 -0
  53. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/WHEEL +0 -0
  54. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py CHANGED
@@ -64,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling.sampling_params import SamplingParams
+ from sglang.srt.server import _set_envs_and_config
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import kill_child_process, suppress_other_loggers
+ from sglang.srt.utils import (
+     configure_logger,
+     kill_child_process,
+     suppress_other_loggers,
+ )


  @dataclasses.dataclass
@@ -255,7 +260,7 @@ def correctness_test(

      # Decode
      output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-     for _ in range(bench_args.output_len[0]):
+     for _ in range(bench_args.output_len[0] - 1):
          next_token_ids, _ = decode(next_token_ids, batch, model_runner)
          for i in range(len(reqs)):
              output_ids[i].append(next_token_ids[i])
@@ -306,7 +311,7 @@ def latency_test_run_once(

      # Decode
      decode_latencies = []
-     for i in range(output_len):
+     for i in range(output_len - 1):
          torch.cuda.synchronize()
          tic = time.time()
          next_token_ids, _ = decode(next_token_ids, batch, model_runner)
@@ -341,6 +346,8 @@ def latency_test(
      bench_args,
      tp_rank,
  ):
+     configure_logger(server_args, prefix=f" TP{tp_rank}")
+     _set_envs_and_config(server_args)
      rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

      # Load the model
@@ -484,18 +491,10 @@ def main(server_args, bench_args):


  if __name__ == "__main__":
-     multiprocessing.set_start_method("spawn", force=True)
-
      parser = argparse.ArgumentParser()
      ServerArgs.add_cli_args(parser)
      BenchArgs.add_cli_args(parser)
-     # For this script, model-path is not required
-     assert (
-         parser._actions[1].option_strings[0] == "--model-path"
-     ), "options changed, this code need to be updated"
-     parser._actions[1].required = False
      args = parser.parse_args()
-
      server_args = ServerArgs.from_cli_args(args)
      bench_args = BenchArgs.from_cli_args(args)

@@ -504,6 +503,8 @@ if __name__ == "__main__":
          format="%(message)s",
      )

+     multiprocessing.set_start_method("spawn", force=True)
+
      try:
          main(server_args, bench_args)
      except Exception as e:
sglang/bench_server_latency.py CHANGED
@@ -174,13 +174,7 @@ if __name__ == "__main__":
      parser = argparse.ArgumentParser()
      ServerArgs.add_cli_args(parser)
      BenchArgs.add_cli_args(parser)
-     # For this script, model-path is not required
-     assert (
-         parser._actions[1].option_strings[0] == "--model-path"
-     ), "options changed, this code need to be updated"
-     parser._actions[1].required = False
      args = parser.parse_args()
-
      server_args = ServerArgs.from_cli_args(args)
      bench_args = BenchArgs.from_cli_args(args)

sglang/srt/hf_transformers_utils.py CHANGED
@@ -129,6 +129,7 @@ def get_tokenizer(
              *args,
              trust_remote_code=trust_remote_code,
              tokenizer_revision=tokenizer_revision,
+             clean_up_tokenization_spaces=False,
              **kwargs,
          )
      except TypeError as e:
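
The get_tokenizer change passes clean_up_tokenization_spaces=False, which disables the Hugging Face post-decode cleanup that collapses spaces around punctuation. A minimal illustration of what that flag controls, assuming a locally available Hugging Face tokenizer ("gpt2" is only an example, not a model referenced by this diff):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=False)
ids = tok.encode("Hello , world !")

# With cleanup enabled, decode collapses the space before punctuation;
# with it disabled, the round trip preserves the original spacing.
print(tok.decode(ids, clean_up_tokenization_spaces=True))   # Hello, world!
print(tok.decode(ids, clean_up_tokenization_spaces=False))  # Hello , world !
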
sglang/srt/layers/activation.py CHANGED
@@ -31,8 +31,9 @@ from vllm.distributed import (
      get_tensor_model_parallel_world_size,
  )
  from vllm.model_executor.custom_op import CustomOp
- from vllm.model_executor.layers.quantization import QuantizationConfig
- from vllm.model_executor.utils import set_weight_attrs
+
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.utils import set_weight_attrs

  logger = logging.getLogger(__name__)

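Both helpers now come from sglang itself instead of vLLM. A rough sketch of how set_weight_attrs is typically used, assuming the sglang.srt.utils version keeps the same shape as the vLLM utility it replaces (a tensor plus a dict of attributes to attach); the weight_loader below is purely hypothetical:

import torch

from sglang.srt.utils import set_weight_attrs


def my_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    # Hypothetical loader that just copies the checkpoint tensor in place.
    param.data.copy_(loaded_weight)


weight = torch.nn.Parameter(torch.empty(8, 8))
# Attach extra metadata to the parameter so the model loader can find it later.
set_weight_attrs(weight, {"weight_loader": my_weight_loader})
assert weight.weight_loader is my_weight_loader
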
sglang/srt/layers/attention_backend.py CHANGED
@@ -86,17 +86,9 @@ class FlashInferAttnBackend(AttentionBackend):
          super().__init__()
          self.model_runner = model_runner

-         local_num_qo_heads = (
-             model_runner.model_config.num_attention_heads // model_runner.tp_size
-         )
-         local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-             model_runner.tp_size
-         )
-         if (
-             not _grouped_size_compiled_for_decode_kernels(
-                 local_num_qo_heads, local_num_kv_heads
-             )
-             or local_num_qo_heads // local_num_kv_heads > 4
+         if not _grouped_size_compiled_for_decode_kernels(
+             model_runner.model_config.num_attention_heads // model_runner.tp_size,
+             model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
          ):
              self.decode_use_tensor_cores = True
          else:
@@ -346,7 +338,9 @@ class TritonAttnBackend(AttentionBackend):

          self.decode_attention_fwd = decode_attention_fwd
          self.extend_attention_fwd = extend_attention_fwd
-         self.num_head = model_runner.model_config.num_attention_heads
+         self.num_head = (
+             model_runner.model_config.num_attention_heads // model_runner.tp_size
+         )

          if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
              self.reduce_dtype = torch.float32
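
Both attention backends now size their heads per tensor-parallel rank instead of using the global head count. Illustrative arithmetic only, with a made-up GQA configuration rather than numbers taken from this diff:

# Hypothetical GQA configuration: 64 query heads, 8 KV heads, tensor parallel size 4.
num_attention_heads = 64
num_kv_heads = 8
tp_size = 4

local_num_qo_heads = num_attention_heads // tp_size     # 16 query heads per rank
local_num_kv_heads = num_kv_heads // tp_size            # 2 KV heads per rank
group_size = local_num_qo_heads // local_num_kv_heads   # 8 query heads share one KV head

print(local_num_qo_heads, local_num_kv_heads, group_size)  # 16 2 8
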
sglang/srt/layers/fused_moe/patch.py ADDED
@@ -0,0 +1,117 @@
+ from typing import Optional
+
+ import torch
+ from torch.nn import functional as F
+
+
+ def fused_topk_native(
+     hidden_states: torch.Tensor,
+     gating_output: torch.Tensor,
+     topk: int,
+     renormalize: bool,
+ ):
+     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+     M, _ = hidden_states.shape
+     topk_weights = torch.empty(
+         M, topk, dtype=torch.float32, device=hidden_states.device
+     )
+     topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+     topk_weights = F.softmax(gating_output.float(), dim=-1)
+     topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+     if renormalize:
+         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+     return topk_weights, topk_ids
+
+
+ # This is used by the Deepseek-V2 model
+ def grouped_topk(
+     hidden_states: torch.Tensor,
+     gating_output: torch.Tensor,
+     topk: int,
+     renormalize: bool,
+     num_expert_group: int = 0,
+     topk_group: int = 0,
+ ):
+
+     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+     scores = torch.softmax(gating_output, dim=-1)
+     num_token = scores.shape[0]
+     group_scores = (
+         scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+     )  # [n, n_group]
+     group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+         1
+     ]  # [n, top_k_group]
+     group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+     group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+     score_mask = (
+         group_mask.unsqueeze(-1)
+         .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+         .reshape(num_token, -1)
+     )  # [n, e]
+     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+
+     if renormalize:
+         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+     return topk_weights, topk_ids
+
+
+ def select_experts_native(
+     hidden_states: torch.Tensor,
+     router_logits: torch.Tensor,
+     top_k: int,
+     use_grouped_topk: bool,
+     renormalize: bool,
+     topk_group: Optional[int] = None,
+     num_expert_group: Optional[int] = None,
+ ):
+     # DeekSeekv2 uses grouped_top_k
+     if use_grouped_topk:
+         assert topk_group is not None
+         assert num_expert_group is not None
+         topk_weights, topk_ids = grouped_topk(
+             hidden_states=hidden_states,
+             gating_output=router_logits,
+             topk=top_k,
+             renormalize=renormalize,
+             num_expert_group=num_expert_group,
+             topk_group=topk_group,
+         )
+     else:
+         topk_weights, topk_ids = fused_topk_native(
+             hidden_states=hidden_states,
+             gating_output=router_logits,
+             topk=top_k,
+             renormalize=renormalize,
+         )
+     return topk_weights, topk_ids
+
+
+ def fused_moe_forward_native(
+     layer: torch.nn.Module,
+     x: torch.Tensor,
+     use_grouped_topk: bool,
+     top_k: int,
+     router_logits: torch.Tensor,
+     renormalize: bool,
+     topk_group: Optional[int] = None,
+     num_expert_group: Optional[int] = None,
+ ) -> torch.Tensor:
+     topk_weights, topk_ids = select_experts_native(
+         hidden_states=x,
+         router_logits=router_logits,
+         use_grouped_topk=use_grouped_topk,
+         top_k=top_k,
+         renormalize=renormalize,
+         topk_group=topk_group,
+         num_expert_group=num_expert_group,
+     )
+     w13_weights = layer.w13_weight[topk_ids]
+     w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+     w2_weights = layer.w2_weight[topk_ids]
+     x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
+     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
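
A quick, hypothetical smoke test for the new fused_moe_forward_native. The stand-in layer only provides the w13_weight and w2_weight attributes the function reads, and the shapes follow the usual fused-MoE layout (gate and up projections stacked in w13_weight, down projection in w2_weight), which is an assumption here rather than something stated in the diff:

import torch

from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native

num_tokens, hidden, intermediate, num_experts, top_k = 4, 16, 32, 8, 2

# Stand-in module exposing just the attributes fused_moe_forward_native indexes.
layer = torch.nn.Module()
layer.w13_weight = torch.nn.Parameter(
    torch.randn(num_experts, 2 * intermediate, hidden)  # w1 (gate) and w3 (up) stacked
)
layer.w2_weight = torch.nn.Parameter(
    torch.randn(num_experts, hidden, intermediate)  # w2 (down) projection
)

x = torch.randn(num_tokens, hidden)
router_logits = torch.randn(num_tokens, num_experts)

out = fused_moe_forward_native(
    layer,
    x,
    use_grouped_topk=False,
    top_k=top_k,
    router_logits=router_logits,
    renormalize=True,
)
print(out.shape)  # torch.Size([4, 16]): one hidden-size vector per token

Because it gathers each selected expert's full weight matrices per token and relies on plain einsum, this native path favors simplicity over speed.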