PyPI - sglang - Versions diffs - 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl - Mend

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

sglang/bench_offline_throughput.py +10 -8
sglang/bench_one_batch.py +7 -6
sglang/bench_one_batch_server.py +157 -21
sglang/bench_serving.py +137 -59
sglang/compile_deep_gemm.py +5 -5
sglang/eval/loogle_eval.py +157 -0
sglang/lang/chat_template.py +78 -78
sglang/lang/tracer.py +1 -1
sglang/srt/code_completion_parser.py +1 -1
sglang/srt/configs/deepseekvl2.py +2 -2
sglang/srt/configs/model_config.py +40 -28
sglang/srt/constrained/base_grammar_backend.py +55 -72
sglang/srt/constrained/llguidance_backend.py +25 -21
sglang/srt/constrained/outlines_backend.py +27 -26
sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
sglang/srt/constrained/xgrammar_backend.py +69 -43
sglang/srt/conversation.py +49 -44
sglang/srt/disaggregation/base/conn.py +1 -0
sglang/srt/disaggregation/decode.py +129 -135
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
sglang/srt/disaggregation/fake/conn.py +3 -13
sglang/srt/disaggregation/kv_events.py +357 -0
sglang/srt/disaggregation/mini_lb.py +57 -24
sglang/srt/disaggregation/mooncake/conn.py +238 -122
sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
sglang/srt/disaggregation/nixl/conn.py +10 -19
sglang/srt/disaggregation/prefill.py +132 -47
sglang/srt/disaggregation/utils.py +123 -6
sglang/srt/distributed/utils.py +3 -3
sglang/srt/entrypoints/EngineBase.py +5 -0
sglang/srt/entrypoints/engine.py +44 -9
sglang/srt/entrypoints/http_server.py +23 -6
sglang/srt/entrypoints/http_server_engine.py +5 -2
sglang/srt/function_call/base_format_detector.py +250 -0
sglang/srt/function_call/core_types.py +34 -0
sglang/srt/function_call/deepseekv3_detector.py +157 -0
sglang/srt/function_call/ebnf_composer.py +234 -0
sglang/srt/function_call/function_call_parser.py +175 -0
sglang/srt/function_call/llama32_detector.py +74 -0
sglang/srt/function_call/mistral_detector.py +84 -0
sglang/srt/function_call/pythonic_detector.py +163 -0
sglang/srt/function_call/qwen25_detector.py +67 -0
sglang/srt/function_call/utils.py +35 -0
sglang/srt/hf_transformers_utils.py +46 -7
sglang/srt/layers/attention/aiter_backend.py +513 -0
sglang/srt/layers/attention/flashattention_backend.py +64 -18
sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
sglang/srt/layers/attention/flashmla_backend.py +340 -78
sglang/srt/layers/attention/triton_backend.py +3 -0
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
sglang/srt/layers/attention/utils.py +6 -4
sglang/srt/layers/attention/vision.py +1 -1
sglang/srt/layers/communicator.py +451 -0
sglang/srt/layers/dp_attention.py +61 -21
sglang/srt/layers/layernorm.py +1 -1
sglang/srt/layers/logits_processor.py +46 -11
sglang/srt/layers/moe/cutlass_moe.py +207 -0
sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
sglang/srt/layers/moe/ep_moe/layer.py +105 -51
sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
sglang/srt/layers/moe/topk.py +67 -10
sglang/srt/layers/multimodal.py +70 -0
sglang/srt/layers/quantization/__init__.py +8 -3
sglang/srt/layers/quantization/blockwise_int8.py +2 -2
sglang/srt/layers/quantization/deep_gemm.py +77 -74
sglang/srt/layers/quantization/fp8.py +92 -2
sglang/srt/layers/quantization/fp8_kernel.py +3 -3
sglang/srt/layers/quantization/fp8_utils.py +6 -0
sglang/srt/layers/quantization/gptq.py +298 -6
sglang/srt/layers/quantization/int8_kernel.py +20 -7
sglang/srt/layers/quantization/qoq.py +244 -0
sglang/srt/layers/sampler.py +0 -4
sglang/srt/layers/vocab_parallel_embedding.py +18 -7
sglang/srt/lora/lora_manager.py +2 -4
sglang/srt/lora/mem_pool.py +4 -4
sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
sglang/srt/lora/utils.py +1 -1
sglang/srt/managers/data_parallel_controller.py +3 -3
sglang/srt/managers/deepseek_eplb.py +278 -0
sglang/srt/managers/detokenizer_manager.py +21 -8
sglang/srt/managers/eplb_manager.py +55 -0
sglang/srt/managers/expert_distribution.py +704 -56
sglang/srt/managers/expert_location.py +394 -0
sglang/srt/managers/expert_location_dispatch.py +91 -0
sglang/srt/managers/io_struct.py +19 -4
sglang/srt/managers/mm_utils.py +294 -140
sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
sglang/srt/managers/multimodal_processors/internvl.py +14 -5
sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
sglang/srt/managers/multimodal_processors/llava.py +46 -0
sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
sglang/srt/managers/schedule_batch.py +122 -42
sglang/srt/managers/schedule_policy.py +1 -5
sglang/srt/managers/scheduler.py +205 -138
sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
sglang/srt/managers/session_controller.py +1 -1
sglang/srt/managers/tokenizer_manager.py +232 -58
sglang/srt/managers/tp_worker.py +12 -9
sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
sglang/srt/mem_cache/base_prefix_cache.py +3 -0
sglang/srt/mem_cache/chunk_cache.py +3 -1
sglang/srt/mem_cache/hiradix_cache.py +4 -4
sglang/srt/mem_cache/memory_pool.py +76 -52
sglang/srt/mem_cache/multimodal_cache.py +45 -0
sglang/srt/mem_cache/radix_cache.py +58 -5
sglang/srt/metrics/collector.py +314 -39
sglang/srt/mm_utils.py +10 -0
sglang/srt/model_executor/cuda_graph_runner.py +29 -19
sglang/srt/model_executor/expert_location_updater.py +422 -0
sglang/srt/model_executor/forward_batch_info.py +5 -1
sglang/srt/model_executor/model_runner.py +163 -68
sglang/srt/model_loader/loader.py +10 -6
sglang/srt/models/clip.py +5 -1
sglang/srt/models/deepseek_janus_pro.py +2 -2
sglang/srt/models/deepseek_v2.py +308 -351
sglang/srt/models/exaone.py +8 -3
sglang/srt/models/gemma3_mm.py +70 -33
sglang/srt/models/llama.py +2 -0
sglang/srt/models/llama4.py +15 -8
sglang/srt/models/llava.py +258 -7
sglang/srt/models/mimo_mtp.py +220 -0
sglang/srt/models/minicpmo.py +5 -12
sglang/srt/models/mistral.py +71 -1
sglang/srt/models/mixtral.py +98 -34
sglang/srt/models/mllama.py +3 -3
sglang/srt/models/pixtral.py +467 -0
sglang/srt/models/qwen2.py +95 -26
sglang/srt/models/qwen2_5_vl.py +8 -0
sglang/srt/models/qwen2_moe.py +330 -60
sglang/srt/models/qwen2_vl.py +6 -0
sglang/srt/models/qwen3.py +52 -10
sglang/srt/models/qwen3_moe.py +411 -48
sglang/srt/models/roberta.py +1 -1
sglang/srt/models/siglip.py +294 -0
sglang/srt/models/torch_native_llama.py +1 -1
sglang/srt/openai_api/adapter.py +58 -20
sglang/srt/openai_api/protocol.py +6 -8
sglang/srt/operations.py +154 -0
sglang/srt/operations_strategy.py +31 -0
sglang/srt/reasoning_parser.py +3 -3
sglang/srt/sampling/custom_logit_processor.py +18 -3
sglang/srt/sampling/sampling_batch_info.py +4 -56
sglang/srt/sampling/sampling_params.py +2 -2
sglang/srt/server_args.py +162 -22
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
sglang/srt/speculative/eagle_utils.py +138 -7
sglang/srt/speculative/eagle_worker.py +69 -21
sglang/srt/utils.py +74 -17
sglang/test/few_shot_gsm8k.py +2 -2
sglang/test/few_shot_gsm8k_engine.py +2 -2
sglang/test/run_eval.py +2 -2
sglang/test/runners.py +8 -1
sglang/test/send_one.py +13 -3
sglang/test/simple_eval_common.py +1 -1
sglang/test/simple_eval_humaneval.py +1 -1
sglang/test/test_cutlass_moe.py +278 -0
sglang/test/test_programs.py +5 -5
sglang/test/test_utils.py +55 -14
sglang/utils.py +3 -3
sglang/version.py +1 -1
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
sglang/srt/function_call_parser.py +0 -858
sglang/srt/platforms/interface.py +0 -371
/sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
/sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0

sglang/srt/layers/communicator.py ADDED Viewed

@@ -0,0 +1,451 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Dict, Optional, Tuple
+import torch.distributed
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.dp_attention import (
+    attn_tp_all_gather,
+    attn_tp_reduce_scatter,
+    dp_gather_partial,
+    dp_scatter,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    get_local_attention_dp_size,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+class ScatterMode(Enum):
+    """Tag describing which token layout a hidden-state/residual tensor is in.
+
+    Elsewhere in this file each member is used as a dictionary key for a
+    process-group size and for an expected number of tokens (dimension 0).
+    """
+    # Paired with a process-group size of 1 in LayerCommunicator.
+    SCATTERED = auto()
+    # Paired with the attention-TP size in LayerCommunicator.
+    TP_ATTN_FULL = auto()
+    # Paired with the tensor-model-parallel world size in LayerCommunicator.
+    FULL = auto()
+@dataclass
+class _LayerModeComputationContext:
+    """Inputs needed to decide the scatter modes of a single layer."""
+    num_layers: int
+    layer_id: int
+    is_layer_sparse: bool
+    # None when nothing is known about the layer before this one.
+    is_previous_layer_sparse: Optional[bool]
+    def previous_layer(self):
+        """Return the context describing the layer right before this one.
+
+        The returned context has ``is_previous_layer_sparse=None``, so it
+        cannot be stepped back a second time.
+        """
+        prev_is_sparse = self.is_previous_layer_sparse
+        assert prev_is_sparse is not None
+        return _LayerModeComputationContext(
+            num_layers=self.num_layers,
+            layer_id=self.layer_id - 1,
+            is_layer_sparse=prev_is_sparse,
+            is_previous_layer_sparse=None,
+        )
+@dataclass
+class LayerScatterModes:
+    """The scatter mode selected for every stage of one layer."""
+    layer_input_mode: ScatterMode
+    attn_mode: ScatterMode
+    # Can be further split into e.g. mlp_input_mode and mlp_output_mode if needed
+    mlp_mode: ScatterMode
+    middle_residual_mode: ScatterMode
+    layer_output_mode: ScatterMode
+    @classmethod
+    def init_new(cls, **kwargs):
+        """Derive all modes of a layer.
+
+        ``kwargs`` are forwarded unchanged to ``_LayerModeComputationContext``.
+        The attention mode is always ``ScatterMode.TP_ATTN_FULL``.
+        """
+        ctx = _LayerModeComputationContext(**kwargs)
+        input_mode = cls._compute_layer_input_mode(ctx)
+        mlp_mode = cls._compute_mlp_mode(ctx)
+        middle_mode = cls._compute_middle_residual_mode(ctx)
+        output_mode = cls._compute_layer_output_mode(ctx)
+        return cls(
+            layer_input_mode=input_mode,
+            attn_mode=ScatterMode.TP_ATTN_FULL,
+            mlp_mode=mlp_mode,
+            middle_residual_mode=middle_mode,
+            layer_output_mode=output_mode,
+        )
+    @classmethod
+    def _compute_layer_input_mode(cls, context: _LayerModeComputationContext):
+        """A layer's input mode is the previous layer's output mode (layer 0: TP_ATTN_FULL)."""
+        if context.layer_id != 0:
+            return cls._compute_layer_output_mode(context.previous_layer())
+        return ScatterMode.TP_ATTN_FULL
+    @classmethod
+    def _compute_mlp_mode(cls, context: _LayerModeComputationContext):
+        """Pick SCATTERED or FULL for the MLP from the server arguments.
+
+        Sparse layers consult ``enable_deepep_moe``; other layers consult
+        ``enable_moe_dense_fully_dp()``.
+        """
+        if context.is_layer_sparse:
+            use_scattered = global_server_args_dict["enable_deepep_moe"]
+        else:
+            use_scattered = enable_moe_dense_fully_dp()
+        return ScatterMode.SCATTERED if use_scattered else ScatterMode.FULL
+    @classmethod
+    def _compute_middle_residual_mode(cls, context: _LayerModeComputationContext):
+        """Mode of the residual between attention and MLP, derived from the MLP mode."""
+        mlp_mode = cls._compute_mlp_mode(context)
+        if mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        if mlp_mode == ScatterMode.SCATTERED:
+            return ScatterMode.SCATTERED
+        raise NotImplementedError
+    @classmethod
+    def _compute_layer_output_mode(cls, context: _LayerModeComputationContext):
+        """Mode of the layer output; the final layer always yields TP_ATTN_FULL."""
+        mlp_mode = cls._compute_mlp_mode(context)
+        is_last_layer = context.layer_id == context.num_layers - 1
+        if is_last_layer or mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        if mlp_mode == ScatterMode.SCATTERED:
+            return ScatterMode.SCATTERED
+        raise NotImplementedError
+def enable_moe_dense_fully_dp():
+    """Return True when the ``moe_dense_tp_size`` server argument equals 1."""
+    dense_tp_size = global_server_args_dict["moe_dense_tp_size"]
+    return dense_tp_size == 1
+class LayerCommunicator:
+    """Runs the layer norms of one layer and moves tensors between scatter modes.
+
+    The three public methods are meant to be called before attention, before
+    the MLP, and after the MLP; each delegates to a module-level
+    ``_communicate_*`` helper using the modes in ``layer_scatter_modes``.
+    """
+    def __init__(
+        self,
+        layer_scatter_modes: LayerScatterModes,
+        input_layernorm: torch.nn.Module,
+        post_attention_layernorm: torch.nn.Module,
+    ):
+        self.layer_scatter_modes = layer_scatter_modes
+        self.input_layernorm = input_layernorm
+        self.post_attention_layernorm = post_attention_layernorm
+        # Parallelism layout is read once at construction time.
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.local_attn_dp_size = get_local_attention_dp_size()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Group size associated with each scatter mode.
+        group_sizes = {}
+        group_sizes[ScatterMode.SCATTERED] = 1
+        group_sizes[ScatterMode.TP_ATTN_FULL] = self.attn_tp_size
+        group_sizes[ScatterMode.FULL] = self.tp_size
+        self.process_group_sizes = group_sizes
+    def prepare_attn(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        """Apply the input layer norm and bring ``hidden_states`` to the attention mode.
+
+        Returns the ``(hidden_states, residual)`` pair.
+        """
+        if hidden_states.shape[0] == 0:
+            # Empty batch: skip the norm and reuse the tensor as residual.
+            residual = hidden_states
+        elif residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        modes = self.layer_scatter_modes
+        hidden_states = _communicate_simple(
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            input_mode=modes.layer_input_mode,
+            output_mode=modes.attn_mode,
+            context=self._compute_context(forward_batch),
+        )
+        return hidden_states, residual
+    def prepare_mlp(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        """Reduce, normalise and re-layout the attention output for the MLP."""
+        modes = self.layer_scatter_modes
+        return _communicate_with_all_reduce_and_layer_norm(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            hidden_states_input_mode=modes.attn_mode,
+            residual_input_mode=modes.layer_input_mode,
+            hidden_states_output_mode=modes.mlp_mode,
+            residual_output_mode=modes.middle_residual_mode,
+            layernorm=self.post_attention_layernorm,
+            context=self._compute_context(forward_batch),
+        )
+    def postprocess_layer(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        """Bring the MLP output and residual to the layer's output mode."""
+        modes = self.layer_scatter_modes
+        return _communicate_summable_tensor_pair(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            hidden_states_input_mode=modes.mlp_mode,
+            residual_input_mode=modes.middle_residual_mode,
+            output_mode=modes.layer_output_mode,
+            context=self._compute_context(forward_batch),
+        )
+    def _compute_context(self, forward_batch: ForwardBatch):
+        """Bundle per-batch token counts with the cached parallelism layout."""
+        tokens_per_mode = _compute_num_tokens_of_mode(
+            forward_batch,
+            attn_tp_rank=self.attn_tp_rank,
+            attn_tp_size=self.attn_tp_size,
+        )
+        return _Context(
+            num_tokens_of_mode=tokens_per_mode,
+            process_group_sizes=self.process_group_sizes,
+            attn_tp_rank=self.attn_tp_rank,
+            attn_tp_size=self.attn_tp_size,
+            local_attn_dp_size=self.local_attn_dp_size,
+            tp_size=self.tp_size,
+        )
+def _compute_num_tokens_of_mode(
+    forward_batch: ForwardBatch, attn_tp_rank: int, attn_tp_size: int
+):
+    """Map every ``ScatterMode`` to the number of tokens expected in dimension 0.
+
+    TP_ATTN_FULL uses the length of ``forward_batch.input_ids``; SCATTERED is
+    this rank's share of that length; FULL uses the gathered buffer when
+    ``enable_dp_attention`` is set and the input length otherwise.
+    """
+    tp_attn_full_num_tokens = forward_batch.input_ids.shape[0]
+    scattered_num_tokens = _torch_tensor_split_len(
+        tp_attn_full_num_tokens, attn_tp_size, attn_tp_rank
+    )
+    if global_server_args_dict["enable_dp_attention"]:
+        full_num_tokens = forward_batch.gathered_buffer.shape[0]
+    else:
+        full_num_tokens = forward_batch.input_ids.shape[0]
+    return {
+        ScatterMode.SCATTERED: scattered_num_tokens,
+        ScatterMode.TP_ATTN_FULL: tp_attn_full_num_tokens,
+        ScatterMode.FULL: full_num_tokens,
+    }
+def _torch_tensor_split_len(tensor_len: int, n: int, output_index: int):
+    """Return the length of chunk ``output_index`` when ``tensor_len`` items are split into ``n`` chunks.
+
+    The first ``tensor_len % n`` chunks get one extra item; the remaining
+    chunks get ``tensor_len // n`` items.
+
+    Args:
+        tensor_len: total number of items (expected to be non-negative).
+        n: number of chunks (expected to be positive).
+        output_index: zero-based index of the chunk whose length is wanted.
+    """
+    # Exact integer arithmetic: true division goes through float and loses
+    # precision once tensor_len no longer fits in a double's mantissa.
+    base_len, num_longer_chunks = divmod(tensor_len, n)
+    if output_index < num_longer_chunks:
+        return base_len + 1
+    return base_len
+@dataclass
+class _Context:
+    """Per-call bookkeeping handed to the ``_communicate_*`` helpers."""
+    # Expected size of dimension 0 for a tensor in each mode.
+    num_tokens_of_mode: Dict["ScatterMode", int]
+    # Process-group size associated with each mode.
+    process_group_sizes: Dict["ScatterMode", int]
+    attn_tp_rank: int
+    attn_tp_size: int
+    local_attn_dp_size: int
+    tp_size: int
+    def is_same_group_size(self, a: "ScatterMode", b: "ScatterMode"):
+        """Tell whether modes ``a`` and ``b`` are associated with equal group sizes."""
+        sizes = self.process_group_sizes
+        return sizes[a] == sizes[b]
+    def check_shape(self, x: torch.Tensor, mode: ScatterMode):
+        """Assert that ``x`` has the token count expected for ``mode`` and return ``x``.
+
+        ``None`` is passed through unchecked.
+        """
+        if x is None:
+            return None
+        actual_num_tokens = x.shape[0]
+        expect_num_tokens = self.num_tokens_of_mode[mode]
+        assert (
+            actual_num_tokens == expect_num_tokens
+        ), f"{actual_num_tokens=} {expect_num_tokens=} {mode=} {x.shape=} {self.num_tokens_of_mode=} {self.process_group_sizes=}"
+        return x
+    def check_shapes(
+        self, xs: Tuple[torch.Tensor, ...], modes: Tuple[ScatterMode, ...]
+    ) -> Tuple[torch.Tensor, ...]:
+        """Apply ``check_shape`` pairwise; ``xs`` and ``modes`` must have equal length."""
+        pairs = zip(xs, modes, strict=True)
+        return tuple(self.check_shape(x, mode) for x, mode in pairs)
+def _communicate_simple(
+    hidden_states: torch.Tensor,
+    forward_batch: ForwardBatch,
+    input_mode: ScatterMode,
+    output_mode: ScatterMode,
+    context: _Context,
+) -> torch.Tensor:
+    """Move ``hidden_states`` from ``input_mode`` to ``output_mode``.
+
+    Nothing is communicated when both modes have the same group size. The
+    only other supported transition is SCATTERED -> TP_ATTN_FULL, which
+    all-gathers into a slice of ``forward_batch.gathered_buffer``.
+
+    Raises:
+        NotImplementedError: for any other pair of modes.
+    """
+    context.check_shape(hidden_states, input_mode)
+    if context.is_same_group_size(input_mode, output_mode):
+        result = hidden_states
+    elif (
+        input_mode == ScatterMode.SCATTERED
+        and output_mode == ScatterMode.TP_ATTN_FULL
+    ):
+        # The gathered result lives in the shared buffer, not in the input tensor.
+        num_local_tokens = forward_batch.input_ids.shape[0]
+        result = forward_batch.gathered_buffer[:num_local_tokens]
+        output_chunks = list(result.tensor_split(context.attn_tp_size))
+        attn_tp_all_gather(output_chunks, hidden_states)
+    else:
+        raise NotImplementedError(f"{input_mode=} {output_mode=}")
+    return context.check_shape(result, output_mode)
+def _communicate_with_all_reduce_and_layer_norm(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    hidden_states_input_mode: ScatterMode,
+    residual_input_mode: ScatterMode,
+    hidden_states_output_mode: ScatterMode,
+    residual_output_mode: ScatterMode,
+    forward_batch: ForwardBatch,
+    layernorm: torch.nn.Module,
+    context: _Context,
+):
+    """Move (hidden_states, residual) to their output modes and normalise.
+
+    Besides communication, needs to
+    1. All reduce in tp_attn_group on hidden_states
+    2. Apply layer norm
+
+    Both inputs are shape-checked against their input modes before any work
+    and the returned pair is shape-checked against the output modes.
+
+    Returns:
+        The ``(hidden_states, residual)`` pair in the requested output modes.
+
+    Raises:
+        NotImplementedError: if the combination of the four modes is not one
+            of the cases handled below.
+    """
+    def _inner():
+        nonlocal hidden_states, residual
+        # Case 1: no layout change and a single attention-TP rank, so only
+        # the layer norm is needed.
+        if (
+            context.is_same_group_size(
+                hidden_states_input_mode, hidden_states_output_mode
+            )
+            and context.is_same_group_size(residual_input_mode, residual_output_mode)
+            and context.attn_tp_size == 1
+        ):
+            # TODO move these `if shape != 0` into LayerNorm itself
+            if hidden_states.shape[0] != 0:
+                hidden_states, residual = layernorm(hidden_states, residual)
+            return hidden_states, residual
+        # Case 2: hidden_states goes TP_ATTN_FULL -> FULL while the residual
+        # stays TP_ATTN_FULL.
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (hidden_states_output_mode == ScatterMode.FULL)
+            and (residual_output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            if context.local_attn_dp_size != 1:
+                # Only attention-TP rank 0 adds the residual (in place).
+                # NOTE(review): presumably so the residual is counted once
+                # when partial results are summed; confirm in dp_gather_partial.
+                if context.attn_tp_rank == 0:
+                    hidden_states += residual
+                hidden_states, local_hidden_states = (
+                    forward_batch.gathered_buffer,
+                    hidden_states,
+                )
+                dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+                dp_scatter(residual, hidden_states, forward_batch)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
+            else:
+                hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+                hidden_states, residual = layernorm(hidden_states, residual)
+            return hidden_states, residual
+        # Case 3: hidden_states goes TP_ATTN_FULL -> SCATTERED via
+        # reduce-scatter; a TP_ATTN_FULL residual is sliced to this rank's chunk.
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
+            and (hidden_states_output_mode == ScatterMode.SCATTERED)
+            and (residual_output_mode == ScatterMode.SCATTERED)
+        ):
+            tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
+            hidden_states = tensor_list[context.attn_tp_rank]
+            attn_tp_reduce_scatter(hidden_states, tensor_list)
+            if residual_input_mode == ScatterMode.TP_ATTN_FULL:
+                residual = residual.tensor_split(context.attn_tp_size)[
+                    context.attn_tp_rank
+                ]
+            if hidden_states.shape[0] != 0:
+                hidden_states, residual = layernorm(hidden_states, residual)
+            return hidden_states, residual
+        # Report all four modes; the message previously printed
+        # residual_output_mode twice and omitted hidden_states_output_mode.
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
+        )
+    context.check_shapes(
+        (hidden_states, residual), (hidden_states_input_mode, residual_input_mode)
+    )
+    return context.check_shapes(
+        _inner(), (hidden_states_output_mode, residual_output_mode)
+    )
+def _communicate_summable_tensor_pair(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    forward_batch: ForwardBatch,
+    hidden_states_input_mode: ScatterMode,
+    residual_input_mode: ScatterMode,
+    output_mode: ScatterMode,
+    context: _Context,
+):
+    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed.
+
+    Brings both tensors to ``output_mode``. Inputs are shape-checked against
+    their input modes first, and the returned pair is shape-checked against
+    ``output_mode`` (a ``None`` residual is passed through unchecked).
+
+    Raises:
+        NotImplementedError: if the mode combination is not handled below.
+    """
+    def _inner():
+        nonlocal hidden_states, residual
+        # Nothing to move when both inputs already match the output group size.
+        if context.is_same_group_size(
+            hidden_states_input_mode, output_mode
+        ) and context.is_same_group_size(residual_input_mode, output_mode):
+            return hidden_states, residual
+        # FULL hidden_states + TP_ATTN_FULL residual -> TP_ATTN_FULL:
+        # scatter hidden_states into a slice of the shared gathered buffer.
+        if (
+            (hidden_states_input_mode == ScatterMode.FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
+            # important: forward batch.gathered_buffer is used both after scatter and after gather.
+            # be careful about this!
+            hidden_states, global_hidden_states = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                hidden_states,
+            )
+            dp_scatter(hidden_states, global_hidden_states, forward_batch)
+            return hidden_states, residual
+        # Both SCATTERED -> TP_ATTN_FULL: fold the residual into hidden_states
+        # in place, drop it, then all-gather into the shared buffer slice.
+        if (
+            (hidden_states_input_mode == ScatterMode.SCATTERED)
+            and (residual_input_mode == ScatterMode.SCATTERED)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            hidden_states += residual
+            residual = None
+            hidden_states, local_hidden_states = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                hidden_states,
+            )
+            attn_tp_all_gather(
+                list(hidden_states.tensor_split(context.attn_tp_size)),
+                local_hidden_states,
+            )
+            return hidden_states, residual
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {output_mode=}"
+        )
+    context.check_shapes(
+        (hidden_states, residual), (hidden_states_input_mode, residual_input_mode)
+    )
+    return context.check_shapes(_inner(), (output_mode, output_mode))

sglang/srt/layers/dp_attention.py CHANGED Viewed

@@ -24,8 +24,10 @@ if TYPE_CHECKING:
 _ATTN_TP_GROUP = None
 _ATTN_TP_RANK = None
 _ATTN_TP_SIZE = None
-_DP_RANK = None
-_DP_SIZE = None
+_ATTN_DP_RANK = None
+_ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_SIZE = None
+_LOCAL_ATTN_DP_RANK = None
 def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
@@ -33,9 +35,27 @@ def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_si
         return tp_rank, tp_size, 0
     attn_tp_size = tp_size // dp_size
-    dp_rank = tp_rank // attn_tp_size
+    attn_dp_rank = tp_rank // attn_tp_size
     attn_tp_rank = tp_rank % attn_tp_size
-    return attn_tp_rank, attn_tp_size, dp_rank
+    return attn_tp_rank, attn_tp_size, attn_dp_rank
+def compute_dp_attention_local_info(
+    enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+):
+    """Compute this rank's attention TP rank, attention TP size and DP rank within its local group.
+
+    Returns ``(tp_rank, tp_size, 0)`` unchanged when DP attention is disabled.
+    Otherwise the local group has ``moe_dense_tp_size`` ranks, or ``tp_size``
+    ranks when that argument is falsy (``None`` or ``0``).
+
+    Returns:
+        Tuple ``(local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank)``.
+    """
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+    group_size = moe_dense_tp_size or tp_size
+    rank_in_group = tp_rank % group_size
+    num_groups = tp_size // group_size
+    # At least one DP replica per local group.
+    dp_size_in_group = max(1, dp_size // num_groups)
+    attn_tp_size_in_group = group_size // dp_size_in_group
+    attn_dp_rank, attn_tp_rank = divmod(rank_in_group, attn_tp_size_in_group)
+    return attn_tp_rank, attn_tp_size_in_group, attn_dp_rank
 def initialize_dp_attention(
@@ -43,22 +63,32 @@ def initialize_dp_attention(
     tp_rank: int,
     tp_size: int,
     dp_size: int,
+    moe_dense_tp_size: int,
     pp_size: int,
 ):
-    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK, _DP_SIZE
+    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK
     from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
-    _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK = compute_dp_attention_world_info(
+    _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK = compute_dp_attention_world_info(
         enable_dp_attention, tp_rank, tp_size, dp_size
     )
+    _, _, _LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+    )
     if enable_dp_attention:
         local_rank = tp_rank % (tp_size // dp_size)
-        _DP_SIZE = dp_size
+        _ATTN_DP_SIZE = dp_size
+        if moe_dense_tp_size is None:
+            _LOCAL_ATTN_DP_SIZE = _ATTN_DP_SIZE
+        else:
+            _LOCAL_ATTN_DP_SIZE = max(1, dp_size // (tp_size // moe_dense_tp_size))
     else:
         local_rank = tp_rank
-        _DP_SIZE = 1
+        _ATTN_DP_SIZE = 1
+        _LOCAL_ATTN_DP_SIZE = 1
     tp_group = get_tp_group()
     _ATTN_TP_GROUP = GroupCoordinator(
@@ -93,13 +123,23 @@ def get_attention_tp_size():
 def get_attention_dp_rank():
-    assert _DP_RANK is not None, "dp attention not initialized!"
-    return _DP_RANK
+    assert _ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_DP_RANK
 def get_attention_dp_size():
-    assert _DP_SIZE is not None, "dp attention not initialized!"
-    return _DP_SIZE
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_DP_SIZE
+def get_local_attention_dp_rank():
+    """Return the module-level local attention DP rank; asserts that it has been set."""
+    local_rank = _LOCAL_ATTN_DP_RANK
+    assert local_rank is not None, "dp attention not initialized!"
+    return local_rank
+def get_local_attention_dp_size():
+    """Return the module-level local attention DP size; asserts that it has been set."""
+    local_size = _LOCAL_ATTN_DP_SIZE
+    assert local_size is not None, "dp attention not initialized!"
+    return local_size
 @contextmanager
@@ -112,19 +152,19 @@ def disable_dp_size():
     Args:
         tp_group (GroupCoordinator): the tp group coordinator
     """
-    global _DP_SIZE
-    assert _DP_SIZE is not None, "dp attention not initialized!"
+    global _ATTN_DP_SIZE
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
-    old_dp_size = _DP_SIZE
-    _DP_SIZE = 1
+    old_dp_size = _ATTN_DP_SIZE
+    _ATTN_DP_SIZE = 1
     try:
         yield
     finally:
-        _DP_SIZE = old_dp_size
+        _ATTN_DP_SIZE = old_dp_size
 def get_dp_local_info(forward_batch: ForwardBatch):
-    dp_rank = get_attention_dp_rank()
+    dp_rank = get_local_attention_dp_rank()
     if forward_batch.dp_local_start_pos is None:
         cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
@@ -201,7 +241,7 @@ def _dp_gather(
             global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
         )
-    # Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
     NUM_GPUS_PER_NODE = 8
     if (
         not local_tokens.dtype.is_floating_point
@@ -252,12 +292,12 @@ def dp_scatter(
         )
-def tp_reduce_scatter(
+def attn_tp_reduce_scatter(
     output: torch.Tensor,
     input_list: List[torch.Tensor],
 ):
     return get_attention_tp_group().reduce_scatter(output, input_list)
-def tp_all_gather(output_list: List[torch.Tensor], input_: torch.Tensor):
+def attn_tp_all_gather(output_list: List[torch.Tensor], input_: torch.Tensor):
     return get_attention_tp_group().all_gather(input_, tensor_list=output_list)

sglang/srt/layers/layernorm.py CHANGED Viewed

@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         if not x.is_contiguous():
-            # NOTE: Romove this if aiter kernel supports discontinuous input
+            # NOTE: Remove this if aiter kernel supports discontinuous input
             x = x.contiguous()
         if residual is not None:
             fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl