sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
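The most visible restructuring in the list above is the split of `sglang/srt/layers/fused_moe` into `fused_moe_grok/`, `fused_moe_triton/`, and a top-level `fused_moe_patch.py`. A minimal, hypothetical sketch of the import-path migration this implies for downstream code; only the module paths come from the file list, the exported symbol names are assumptions:

```python
# Hypothetical import-path migration implied by the file moves above.
# Module paths are taken from the diff; the exported names are assumed.
try:
    # 0.3.6.post1 layout
    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe
    from sglang.srt.layers.fused_moe_patch import fused_moe_forward_native
except ImportError:
    # 0.3.5.post2 layout
    from sglang.srt.layers.fused_moe.fused_moe import fused_moe
    from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native
```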
sglang/srt/model_executor/cuda_graph_runner.py

@@ -1,22 +1,20 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Run the model with cuda graph and torch.compile."""
 
+from __future__ import annotations
+
 import bisect
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable
@@ -25,7 +23,7 @@ import torch
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
-from sglang.srt.layers.
+from sglang.srt.layers.fused_moe_patch import fused_moe_forward_native
 from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
@@ -67,7 +65,10 @@ def patch_model(
             _to_torch(model)
             monkey_patch_vllm_all_gather()
             backup_ca_comm = tp_group.ca_comm
-
+            # Use custom-allreduce here.
+            # We found the custom allreduce is much faster than the built-in allreduce in torch,
+            # even with ENABLE_INTRA_NODE_COMM=1.
+            # tp_group.ca_comm = None
             yield torch.compile(
                 torch.no_grad()(model.forward), mode="max-autotune-no-cudagraphs"
             )
@@ -90,6 +91,8 @@ def set_torch_compile_config():
 
     # FIXME: tmp workaround
    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
 
 
 @maybe_torch_compile(dynamic=True)
@@ -111,6 +114,8 @@ class CudaGraphRunner:
         self.use_torch_compile = model_runner.server_args.enable_torch_compile
         self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
         self.is_encoder_decoder = self.model_runner.model_config.is_encoder_decoder
+        self.enable_dp_attention = self.model_runner.server_args.enable_dp_attention
+        self.tp_size = self.model_runner.tp_size
 
         # Batch sizes to capture
         if model_runner.server_args.disable_cuda_graph_padding:
@@ -165,6 +170,15 @@ class CudaGraphRunner:
         else:
             self.encoder_lens = None
 
+        if self.enable_dp_attention:
+            self.gathered_buffer = torch.zeros(
+                (
+                    self.max_bs * self.tp_size,
+                    self.model_runner.model_config.hidden_size,
+                ),
+                dtype=self.model_runner.dtype,
+            )
+
         # Capture
         try:
             with self.model_capture_mode():
@@ -190,11 +204,21 @@ class CudaGraphRunner:
         self.model_runner.model.capture_mode = False
 
     def can_run(self, forward_batch: ForwardBatch):
-
-        forward_batch.
-
-
-
+        if self.enable_dp_attention:
+            min_num_tokens, max_num_tokens = min(forward_batch.global_num_tokens), max(
+                forward_batch.global_num_tokens
+            )
+            is_bs_supported = forward_batch.can_run_dp_cuda_graph and (
+                (min_num_tokens == max_num_tokens and max_num_tokens in self.graphs)
+                if self.disable_padding
+                else max_num_tokens <= self.max_bs
+            )
+        else:
+            is_bs_supported = (
+                forward_batch.batch_size in self.graphs
+                if self.disable_padding
+                else forward_batch.batch_size <= self.max_bs
+            )
 
         # NOTE: cuda graph cannot handle mixed batch (encoder_len = 0)
         # If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph
@@ -239,6 +263,13 @@ class CudaGraphRunner:
         seq_lens_sum = seq_lens.sum().item()
         mrope_positions = self.mrope_positions[:, :bs]
 
+        if self.enable_dp_attention:
+            global_num_tokens = [bs] * self.tp_size
+            gathered_buffer = self.gathered_buffer[: bs * self.tp_size]
+        else:
+            global_num_tokens = None
+            gathered_buffer = None
+
         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
             bs,
@@ -265,6 +296,8 @@
             top_logprobs_nums=[0] * bs,
             positions=clamp_position(seq_lens),
             mrope_positions=mrope_positions,
+            global_num_tokens=global_num_tokens,
+            gathered_buffer=gathered_buffer,
         )
         logits_output = forward(input_ids, forward_batch.positions, forward_batch)
         return logits_output.next_token_logits
@@ -295,7 +328,12 @@
         raw_bs = forward_batch.batch_size
 
         # Pad
-
+        if self.enable_dp_attention:
+            index = bisect.bisect_left(
+                self.capture_bs, max(forward_batch.global_num_tokens)
+            )
+        else:
+            index = bisect.bisect_left(self.capture_bs, raw_bs)
         bs = self.capture_bs[index]
         if bs != raw_bs:
             self.seq_lens.fill_(1)
sglang/srt/model_executor/forward_batch_info.py

@@ -1,20 +1,16 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Store information about a forward batch.
 
@@ -31,11 +27,15 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 It contains low-level tensor data. Most of the data consists of GPU tensors.
 """
 
+from __future__ import annotations
+
 from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import TYPE_CHECKING, List, Optional
 
 import torch
+import triton
+import triton.language as tl
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 
@@ -50,12 +50,18 @@ if TYPE_CHECKING:
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()
-    # Extend a sequence. The KV cache of the
+    # Extend a sequence. The KV cache of the beginning part of the sequence is already computed (e.g., system prompt).
     EXTEND = auto()
     # Decode one token.
     DECODE = auto()
-    # Contains both EXTEND and DECODE.
+    # Contains both EXTEND and DECODE when doing chunked prefill.
     MIXED = auto()
+    # No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
+    IDLE = auto()
+
+    # A dummy first batch to start the pipeline for overlap scheduler.
+    # It is now used for triggering the sampling_info_done event for the first prefill batch.
+    DUMMY_FIRST = auto()
 
     def is_prefill(self):
         return self == ForwardMode.PREFILL
@@ -69,6 +75,12 @@ class ForwardMode(IntEnum):
     def is_mixed(self):
         return self == ForwardMode.MIXED
 
+    def is_idle(self):
+        return self == ForwardMode.IDLE
+
+    def is_dummy_first(self):
+        return self == ForwardMode.DUMMY_FIRST
+
 
 @dataclass
 class ForwardBatch:
@@ -102,6 +114,7 @@ class ForwardBatch:
     extend_seq_lens: Optional[torch.Tensor] = None
     extend_prefix_lens: Optional[torch.Tensor] = None
     extend_start_loc: Optional[torch.Tensor] = None
+    extend_prefix_lens_cpu: Optional[List[int]] = None
     extend_seq_lens_cpu: Optional[List[int]] = None
     extend_logprob_start_lens_cpu: Optional[List[int]] = None
 
@@ -117,6 +130,9 @@ class ForwardBatch:
     # For LoRA
     lora_paths: Optional[List[str]] = None
 
+    # For input embeddings
+    input_embeds: Optional[torch.tensor] = None
+
     # Sampling info
     sampling_info: SamplingBatchInfo = None
 
@@ -128,6 +144,11 @@ class ForwardBatch:
     # For Qwen2-VL
     mrope_positions: torch.Tensor = None
 
+    # For DP attention
+    global_num_tokens: Optional[List[int]] = None
+    gathered_buffer: Optional[torch.Tensor] = None
+    can_run_dp_cuda_graph: bool = False
+
     def compute_mrope_positions(
         self, model_runner: ModelRunner, batch: ModelWorkerBatch
     ):
@@ -209,31 +230,37 @@ class ForwardBatch:
             seq_lens_sum=batch.seq_lens_sum,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
+            global_num_tokens=batch.global_num_tokens,
+            can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
             lora_paths=batch.lora_paths,
             sampling_info=batch.sampling_info,
+            input_embeds=batch.input_embeds,
         )
 
+        if ret.global_num_tokens is not None:
+            max_len = max(ret.global_num_tokens)
+            ret.gathered_buffer = torch.zeros(
+                (max_len * model_runner.tp_size, model_runner.model_config.hidden_size),
+                dtype=model_runner.dtype,
+                device=device,
+            )
+
+        if ret.forward_mode.is_idle():
+            return ret
+
         # Init position information
         if not ret.forward_mode.is_decode():
-            ret.positions = torch.concat(
-                [
-                    torch.arange(prefix_len, prefix_len + extend_len, device=device)
-                    for prefix_len, extend_len in zip(
-                        batch.extend_prefix_lens, batch.extend_seq_lens
-                    )
-                ],
-                axis=0,
-            )
-            ret.extend_num_tokens = batch.extend_num_tokens
             ret.extend_seq_lens = torch.tensor(
                 batch.extend_seq_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
-
             ret.extend_prefix_lens = torch.tensor(
                 batch.extend_prefix_lens, dtype=torch.int32
            ).to(device, non_blocking=True)
-            ret.
-            ret.extend_start_loc
+            ret.extend_num_tokens = batch.extend_num_tokens
+            ret.positions, ret.extend_start_loc = compute_position_triton(
+                ret.extend_prefix_lens, ret.extend_seq_lens, ret.extend_num_tokens
+            )
+            ret.extend_prefix_lens_cpu = batch.extend_prefix_lens
             ret.extend_seq_lens_cpu = batch.extend_seq_lens
             ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
 
@@ -250,3 +277,72 @@ class ForwardBatch:
             model_runner.lora_manager.prepare_lora_batch(ret)
 
         return ret
+
+
+def compute_position_triton(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor, extend_seq_lens_sum
+):
+    """Compute positions. It is a fused version of `compute_position_torch`."""
+    batch_size = extend_seq_lens.shape[0]
+    positions = torch.empty(
+        extend_seq_lens_sum, dtype=torch.int64, device=extend_seq_lens.device
+    )
+    extend_start_loc = torch.empty(
+        batch_size, dtype=torch.int32, device=extend_seq_lens.device
+    )
+
+    # Launch kernel
+    compute_position_kernel[(batch_size,)](
+        positions,
+        extend_start_loc,
+        extend_prefix_lens,
+        extend_seq_lens,
+    )
+
+    return positions, extend_start_loc
+
+
+@triton.jit
+def compute_position_kernel(
+    positions,
+    extend_start_loc,
+    extend_prefix_lens,
+    extend_seq_lens,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0)
+
+    prefix_len = tl.load(extend_prefix_lens + pid)
+    seq_len = tl.load(extend_seq_lens + pid)
+
+    # TODO: optimize this?
+    cumsum_start = 0
+    for i in range(pid):
+        cumsum_start += tl.load(extend_seq_lens + i)
+
+    num_loop = tl.cdiv(seq_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        tl.store(
+            positions + cumsum_start + offset,
+            prefix_len + offset,
+            mask=offset < seq_len,
+        )
+    tl.store(extend_start_loc + pid, cumsum_start)
+
+
+def compute_position_torch(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor
+):
+    positions = torch.concat(
+        [
+            torch.arange(
+                prefix_len, prefix_len + extend_len, device=extend_prefix_lens.device
+            )
+            for prefix_len, extend_len in zip(extend_prefix_lens, extend_seq_lens)
+        ],
+        axis=0,
+    )
+    extend_start_loc = torch.zeros_like(extend_seq_lens)
+    extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
+    return positions.to(torch.int64), extend_start_loc