PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (359) hide show

sglang/srt/layers/communicator.py CHANGED Viewed

@@ -14,7 +14,8 @@
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Dict, Optional, Tuple
+from functools import partial
+from typing import Dict, Optional
 import torch.distributed
@@ -36,10 +37,23 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 class ScatterMode(Enum):
+    """
+    Suppose we have TP=4, DP=2, enable-dp-attention, and the system handles seq a,b,c,d
+    Model input/output: [ab, ab, cd, cd] for four ranks respectively
+    SCATTERED: [a, b, c, d]
+    TP_ATTN_FULL: [ab, ab, cd, cd], i.e. all ranks inside a TP attn group have full data of the group
+    FULL: [abcd, abcd, abcd, abcd]
+    """
     SCATTERED = auto()
     TP_ATTN_FULL = auto()
     FULL = auto()
+    @staticmethod
+    def model_input_output():
+        """The scatter mode for model forward pass input and output data"""
+        return ScatterMode.TP_ATTN_FULL
 @dataclass
 class _LayerModeComputationContext:
@@ -81,7 +95,7 @@ class LayerScatterModes:
     @classmethod
     def _compute_layer_input_mode(cls, context: _LayerModeComputationContext):
         if context.layer_id == 0:
-            return ScatterMode.TP_ATTN_FULL
+            return ScatterMode.model_input_output()
         return cls._compute_layer_output_mode(context.previous_layer())
     @classmethod
@@ -112,7 +126,7 @@ class LayerScatterModes:
     def _compute_layer_output_mode(cls, context: _LayerModeComputationContext):
         mlp_mode = cls._compute_mlp_mode(context)
         if context.layer_id == context.num_layers - 1:
-            return ScatterMode.TP_ATTN_FULL
+            return ScatterMode.model_input_output()
         if mlp_mode == ScatterMode.SCATTERED:
             return ScatterMode.SCATTERED
         if mlp_mode == ScatterMode.FULL:
@@ -135,15 +149,29 @@ class LayerCommunicator:
         self.input_layernorm = input_layernorm
         self.post_attention_layernorm = post_attention_layernorm
-        self.attn_tp_rank = get_attention_tp_rank()
-        self.attn_tp_size = get_attention_tp_size()
-        self.local_attn_dp_size = get_local_attention_dp_size()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.process_group_sizes = {
-            ScatterMode.SCATTERED: 1,
-            ScatterMode.TP_ATTN_FULL: self.attn_tp_size,
-            ScatterMode.FULL: self.tp_size,
-        }
+        self._context = CommunicateContext.init_new()
+        self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
+            input_mode=self.layer_scatter_modes.layer_input_mode,
+            output_mode=self.layer_scatter_modes.attn_mode,
+            context=self._context,
+        )
+        self._communicate_with_all_reduce_and_layer_norm_fn = (
+            CommunicateWithAllReduceAndLayerNormFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.attn_mode,
+                residual_input_mode=self.layer_scatter_modes.layer_input_mode,
+                hidden_states_output_mode=self.layer_scatter_modes.mlp_mode,
+                residual_output_mode=self.layer_scatter_modes.middle_residual_mode,
+                context=self._context,
+            )
+        )
+        self._communicate_summable_tensor_pair_fn = (
+            CommunicateSummableTensorPairFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.mlp_mode,
+                residual_input_mode=self.layer_scatter_modes.middle_residual_mode,
+                output_mode=self.layer_scatter_modes.layer_output_mode,
+                context=self._context,
+            )
+        )
     def prepare_attn(
         self,
@@ -160,12 +188,10 @@ class LayerCommunicator:
             else:
                 hidden_states, residual = self.input_layernorm(hidden_states, residual)
-        hidden_states = _communicate_simple(
+        hidden_states = self._communicate_simple_fn(
             hidden_states=hidden_states,
             forward_batch=forward_batch,
-            input_mode=self.layer_scatter_modes.layer_input_mode,
-            output_mode=self.layer_scatter_modes.attn_mode,
-            context=self._compute_context(forward_batch),
+            context=self._context,
         )
         return hidden_states, residual
@@ -176,16 +202,12 @@ class LayerCommunicator:
         residual: torch.Tensor,
         forward_batch: ForwardBatch,
     ):
-        return _communicate_with_all_reduce_and_layer_norm(
+        return self._communicate_with_all_reduce_and_layer_norm_fn(
             hidden_states=hidden_states,
             residual=residual,
             forward_batch=forward_batch,
-            hidden_states_input_mode=self.layer_scatter_modes.attn_mode,
-            residual_input_mode=self.layer_scatter_modes.layer_input_mode,
-            hidden_states_output_mode=self.layer_scatter_modes.mlp_mode,
-            residual_output_mode=self.layer_scatter_modes.middle_residual_mode,
             layernorm=self.post_attention_layernorm,
-            context=self._compute_context(forward_batch),
+            context=self._context,
         )
     def postprocess_layer(
@@ -194,136 +216,102 @@ class LayerCommunicator:
         residual: torch.Tensor,
         forward_batch: ForwardBatch,
     ):
-        return _communicate_summable_tensor_pair(
+        return self._communicate_summable_tensor_pair_fn(
             hidden_states=hidden_states,
             residual=residual,
             forward_batch=forward_batch,
-            hidden_states_input_mode=self.layer_scatter_modes.mlp_mode,
-            residual_input_mode=self.layer_scatter_modes.middle_residual_mode,
-            output_mode=self.layer_scatter_modes.layer_output_mode,
-            context=self._compute_context(forward_batch),
-        )
-    def _compute_context(self, forward_batch: ForwardBatch):
-        return _Context(
-            num_tokens_of_mode=_compute_num_tokens_of_mode(
-                forward_batch,
-                attn_tp_rank=self.attn_tp_rank,
-                attn_tp_size=self.attn_tp_size,
-            ),
-            process_group_sizes=self.process_group_sizes,
-            attn_tp_rank=self.attn_tp_rank,
-            attn_tp_size=self.attn_tp_size,
-            local_attn_dp_size=self.local_attn_dp_size,
-            tp_size=self.tp_size,
+            context=self._context,
         )
-def _compute_num_tokens_of_mode(
-    forward_batch: ForwardBatch, attn_tp_rank: int, attn_tp_size: int
-):
-    tp_attn_full_num_tokens = forward_batch.input_ids.shape[0]
-    return {
-        ScatterMode.SCATTERED: _torch_tensor_split_len(
-            tp_attn_full_num_tokens, attn_tp_size, attn_tp_rank
-        ),
-        ScatterMode.TP_ATTN_FULL: tp_attn_full_num_tokens,
-        ScatterMode.FULL: (
-            forward_batch.gathered_buffer.shape[0]
-            if global_server_args_dict["enable_dp_attention"]
-            else forward_batch.input_ids.shape[0]
-        ),
-    }
-def _torch_tensor_split_len(tensor_len: int, n: int, output_index: int):
-    if output_index < int(tensor_len % n):
-        return int(tensor_len / n) + 1
-    else:
-        return int(tensor_len / n)
 @dataclass
-class _Context:
-    num_tokens_of_mode: Dict["ScatterMode", int]
-    process_group_sizes: Dict["ScatterMode", int]
+class CommunicateContext:
+    process_group_sizes: Dict[ScatterMode, int]
     attn_tp_rank: int
     attn_tp_size: int
     local_attn_dp_size: int
     tp_size: int
-    def is_same_group_size(self, a: "ScatterMode", b: "ScatterMode"):
+    def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
         return self.process_group_sizes[a] == self.process_group_sizes[b]
-    def check_shape(self, x: torch.Tensor, mode: ScatterMode):
-        if x is None:
-            return
-        actual_num_tokens = x.shape[0]
-        expect_num_tokens = self.num_tokens_of_mode[mode]
-        assert (
-            actual_num_tokens == expect_num_tokens
-        ), f"{actual_num_tokens=} {expect_num_tokens=} {mode=} {x.shape=} {self.num_tokens_of_mode=} {self.process_group_sizes=}"
-        return x
-    def check_shapes(
-        self, xs: Tuple[torch.Tensor, ...], modes: Tuple[ScatterMode, ...]
-    ) -> Tuple[torch.Tensor, ...]:
-        return tuple(
-            [self.check_shape(x, mode) for x, mode in zip(xs, modes, strict=True)]
+    @classmethod
+    def init_new(cls):
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        local_attn_dp_size = get_local_attention_dp_size()
+        tp_size = get_tensor_model_parallel_world_size()
+        process_group_sizes = {
+            ScatterMode.SCATTERED: 1,
+            ScatterMode.TP_ATTN_FULL: attn_tp_size,
+            # TODO: support --moe-dense-tp-size > 1
+            ScatterMode.FULL: tp_size,
+        }
+        return cls(
+            process_group_sizes=process_group_sizes,
+            attn_tp_rank=attn_tp_rank,
+            attn_tp_size=attn_tp_size,
+            local_attn_dp_size=local_attn_dp_size,
+            tp_size=tp_size,
         )
-def _communicate_simple(
-    hidden_states: torch.Tensor,
-    forward_batch: ForwardBatch,
-    input_mode: ScatterMode,
-    output_mode: ScatterMode,
-    context: _Context,
-) -> torch.Tensor:
-    def _inner():
-        nonlocal hidden_states
+class CommunicateSimpleFn:
+    @staticmethod
+    def get_fn(
+        input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
         if context.is_same_group_size(input_mode, output_mode):
-            return hidden_states
+            return CommunicateSimpleFn._trivial
         if (input_mode == ScatterMode.SCATTERED) and (
             output_mode == ScatterMode.TP_ATTN_FULL
         ):
-            hidden_states, local_hidden_states = (
-                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
-                hidden_states,
-            )
-            attn_tp_all_gather(
-                list(hidden_states.tensor_split(context.attn_tp_size)),
-                local_hidden_states,
-            )
-            return hidden_states
+            return CommunicateSimpleFn._scattered_to_tp_attn_full
         raise NotImplementedError(f"{input_mode=} {output_mode=}")
-    context.check_shape(hidden_states, input_mode)
-    return context.check_shape(_inner(), output_mode)
-def _communicate_with_all_reduce_and_layer_norm(
-    hidden_states: torch.Tensor,
-    residual: torch.Tensor,
-    hidden_states_input_mode: ScatterMode,
-    residual_input_mode: ScatterMode,
-    hidden_states_output_mode: ScatterMode,
-    residual_output_mode: ScatterMode,
-    forward_batch: ForwardBatch,
-    layernorm: torch.nn.Module,
-    context: _Context,
-):
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> torch.Tensor:
+        return hidden_states
+    @staticmethod
+    def _scattered_to_tp_attn_full(
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> torch.Tensor:
+        hidden_states, local_hidden_states = (
+            forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+            hidden_states,
+        )
+        attn_tp_all_gather(
+            list(hidden_states.tensor_split(context.attn_tp_size)),
+            local_hidden_states,
+        )
+        return hidden_states
+class CommunicateWithAllReduceAndLayerNormFn:
     """Besides communication, needs to
     1. All reduce in tp_attn_group on hidden_states
     2. Apply layer norm
     """
-    def _inner():
-        nonlocal hidden_states, residual
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        hidden_states_output_mode: ScatterMode,
+        residual_output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
         if (
             context.is_same_group_size(
@@ -332,32 +320,20 @@ def _communicate_with_all_reduce_and_layer_norm(
             and context.is_same_group_size(residual_input_mode, residual_output_mode)
             and context.attn_tp_size == 1
         ):
-            # TODO move these `if shape != 0` into LayerNorm itself
-            if hidden_states.shape[0] != 0:
-                hidden_states, residual = layernorm(hidden_states, residual)
-            return hidden_states, residual
+            return CommunicateWithAllReduceAndLayerNormFn._simple
         if (
             (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
-            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
             and (hidden_states_output_mode == ScatterMode.FULL)
             and (residual_output_mode == ScatterMode.TP_ATTN_FULL)
         ):
-            if context.local_attn_dp_size != 1:
-                if context.attn_tp_rank == 0:
-                    hidden_states += residual
-                hidden_states, local_hidden_states = (
-                    forward_batch.gathered_buffer,
-                    hidden_states,
-                )
-                dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
-                dp_scatter(residual, hidden_states, forward_batch)
-                if hidden_states.shape[0] != 0:
-                    hidden_states = layernorm(hidden_states)
-            else:
-                hidden_states = tensor_model_parallel_all_reduce(hidden_states)
-                hidden_states, residual = layernorm(hidden_states, residual)
-            return hidden_states, residual
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
         if (
             (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
@@ -367,85 +343,193 @@ def _communicate_with_all_reduce_and_layer_norm(
             and (hidden_states_output_mode == ScatterMode.SCATTERED)
             and (residual_output_mode == ScatterMode.SCATTERED)
         ):
-            tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
-            hidden_states = tensor_list[context.attn_tp_rank]
-            attn_tp_reduce_scatter(hidden_states, tensor_list)
-            if residual_input_mode == ScatterMode.TP_ATTN_FULL:
-                residual = residual.tensor_split(context.attn_tp_size)[
-                    context.attn_tp_rank
-                ]
-            if hidden_states.shape[0] != 0:
-                hidden_states, residual = layernorm(hidden_states, residual)
-            return hidden_states, residual
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._scatter_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
         raise NotImplementedError(
             f"{hidden_states_input_mode=} {residual_input_mode=} {residual_output_mode=} {residual_output_mode=}"
         )
-    context.check_shapes(
-        (hidden_states, residual), (hidden_states_input_mode, residual_input_mode)
-    )
-    return context.check_shapes(
-        _inner(), (hidden_states_output_mode, residual_output_mode)
-    )
-def _communicate_summable_tensor_pair(
-    hidden_states: torch.Tensor,
-    residual: torch.Tensor,
-    forward_batch: ForwardBatch,
-    hidden_states_input_mode: ScatterMode,
-    residual_input_mode: ScatterMode,
-    output_mode: ScatterMode,
-    context: _Context,
-):
-    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
+    @staticmethod
+    def _simple(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+    ):
+        # TODO move these `if shape != 0` into LayerNorm itself
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+    @staticmethod
+    def _gather_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
+            residual, local_residual = (
+                forward_batch.gathered_buffer[
+                    : forward_batch.input_ids.shape[0]
+                ].clone(),
+                residual,
+            )
+            attn_tp_all_gather(
+                list(residual.tensor_split(context.attn_tp_size)), local_residual
+            )
+        if context.local_attn_dp_size != 1:
+            if context.attn_tp_rank == 0:
+                hidden_states += residual
+            hidden_states, local_hidden_states = (
+                forward_batch.gathered_buffer,
+                hidden_states,
+            )
+            dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+            dp_scatter(residual, hidden_states, forward_batch)
+            if hidden_states.shape[0] != 0:
+                hidden_states = layernorm(hidden_states)
+        else:
+            hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
-    def _inner():
-        nonlocal hidden_states, residual
+    @staticmethod
+    def _scatter_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
+        hidden_states = tensor_list[context.attn_tp_rank]
+        attn_tp_reduce_scatter(hidden_states, tensor_list)
+        if residual_input_mode == ScatterMode.TP_ATTN_FULL:
+            residual = residual.tensor_split(context.attn_tp_size)[context.attn_tp_rank]
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+class CommunicateSummableTensorPairFn:
+    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
+    @classmethod
+    def execute(
+        cls,
+        hidden_states_input_mode,
+        residual_input_mode,
+        output_mode,
+        context,
+        **kwargs,
+    ):
+        return cls.get_fn(
+            hidden_states_input_mode=hidden_states_input_mode,
+            residual_input_mode=residual_input_mode,
+            output_mode=output_mode,
+            context=context,
+        )(context=context, **kwargs)
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
         if context.is_same_group_size(
             hidden_states_input_mode, output_mode
         ) and context.is_same_group_size(residual_input_mode, output_mode):
-            return hidden_states, residual
+            return CommunicateSummableTensorPairFn._trivial
         if (
             (hidden_states_input_mode == ScatterMode.FULL)
             and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
             and (output_mode == ScatterMode.TP_ATTN_FULL)
         ):
-            # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
-            # important: forward batch.gathered_buffer is used both after scatter and after gather.
-            # be careful about this!
-            hidden_states, global_hidden_states = (
-                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
-                hidden_states,
-            )
-            dp_scatter(hidden_states, global_hidden_states, forward_batch)
-            return hidden_states, residual
+            return CommunicateSummableTensorPairFn._scatter_hidden_states
         if (
             (hidden_states_input_mode == ScatterMode.SCATTERED)
             and (residual_input_mode == ScatterMode.SCATTERED)
             and (output_mode == ScatterMode.TP_ATTN_FULL)
         ):
-            hidden_states += residual
-            residual = None
-            hidden_states, local_hidden_states = (
-                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
-                hidden_states,
-            )
-            attn_tp_all_gather(
-                list(hidden_states.tensor_split(context.attn_tp_size)),
-                local_hidden_states,
-            )
-            return hidden_states, residual
+            return CommunicateSummableTensorPairFn._gather
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.SCATTERED)
+        ):
+            return CommunicateSummableTensorPairFn._scatter
         raise NotImplementedError(
             f"{hidden_states_input_mode=} {residual_input_mode=} {output_mode=}"
         )
-    context.check_shapes(
-        (hidden_states, residual), (hidden_states_input_mode, residual_input_mode)
-    )
-    return context.check_shapes(_inner(), (output_mode, output_mode))
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        return hidden_states, residual
+    @staticmethod
+    def _scatter_hidden_states(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
+        # important: forward batch.gathered_buffer is used both after scatter and after gather.
+        # be careful about this!
+        hidden_states, global_hidden_states = (
+            forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+            hidden_states,
+        )
+        dp_scatter(hidden_states, global_hidden_states, forward_batch)
+        return hidden_states, residual
+    @staticmethod
+    def _gather(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        hidden_states += residual
+        residual = None
+        hidden_states, local_hidden_states = (
+            forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+            hidden_states,
+        )
+        attn_tp_all_gather(
+            list(hidden_states.tensor_split(context.attn_tp_size)),
+            local_hidden_states,
+        )
+        return hidden_states, residual
+    @staticmethod
+    def _scatter(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        assert residual is None, "not yet handled residual!=None"
+        tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
+        hidden_states = tensor_list[context.attn_tp_rank]
+        return hidden_states, residual

sglang/srt/layers/dp_attention.py CHANGED Viewed

@@ -98,11 +98,12 @@ def initialize_dp_attention(
         ],
         local_rank,
         torch.distributed.get_backend(tp_group.device_group),
-        SYNC_TOKEN_IDS_ACROSS_TP,
-        False,
-        False,
-        False,
-        False,
+        use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP,
+        use_pymscclpp=False,
+        use_custom_allreduce=False,
+        use_hpu_communicator=False,
+        use_xpu_communicator=False,
+        use_npu_communicator=False,
         group_name="attention_tp",
     )

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl