PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (359) hide show

sglang/srt/model_executor/expert_location_updater.py CHANGED Viewed

@@ -12,8 +12,10 @@
 # limitations under the License.
 # ==============================================================================
 import logging
-from typing import Dict, List, Tuple
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+import einops
 import torch
 import torch.distributed
 from torch.distributed import P2POp
@@ -22,58 +24,131 @@ from sglang.srt.managers.expert_location import (
     ExpertLocationMetadata,
     get_global_expert_location_metadata,
 )
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import get_bool_env_var
 logger = logging.getLogger(__name__)
-def update_expert_location(
+class ExpertLocationUpdater:
+    def __init__(self):
+        self._first_execution = True
+    def update(
+        self,
+        routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+        nnodes: int,
+        rank: int,
+    ):
+        if self._first_execution:
+            self._first_execution = False
+            torch.cuda.empty_cache()
+        old_expert_location_metadata = get_global_expert_location_metadata()
+        _update_expert_weights(
+            routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+            old_expert_location_metadata=old_expert_location_metadata,
+            new_expert_location_metadata=new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+            nnodes=nnodes,
+            rank=rank,
+        )
+        old_expert_location_metadata.update(
+            new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+        )
+def _update_expert_weights(**kwargs):
+    if get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_CANARY"):
+        return _update_expert_weights_with_canary(**kwargs)
+    else:
+        return _update_expert_weights_raw(**kwargs)
+# can add watchdog as well
+def _update_expert_weights_with_canary(
     routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+    old_expert_location_metadata: ExpertLocationMetadata,
     new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
     nnodes: int,
     rank: int,
 ):
-    old_expert_location_metadata = get_global_expert_location_metadata()
-    _update_expert_weights(
-        routed_experts_weights_of_layer,
-        old_expert_location_metadata,
-        new_expert_location_metadata,
-        nnodes,
-        rank,
+    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
+    def _get_canary_value(meta: ExpertLocationMetadata, layer_id: int):
+        return meta.physical_to_logical_map_cpu[
+            layer_id,
+            num_local_physical_experts * rank : num_local_physical_experts * (rank + 1),
+        ]
+    routed_experts_weights_of_layer = {
+        k: [x for x in v] for k, v in routed_experts_weights_of_layer.items()
+    }
+    for layer_id in update_layer_ids:
+        canary_tensor = (
+            _get_canary_value(old_expert_location_metadata, layer_id)
+            .clone()
+            .to(device=global_server_args_dict["device"], non_blocking=True)
+        )
+        routed_experts_weights_of_layer[layer_id].append(canary_tensor)
+    _update_expert_weights_raw(
+        routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+        old_expert_location_metadata=old_expert_location_metadata,
+        new_expert_location_metadata=new_expert_location_metadata,
+        update_layer_ids=update_layer_ids,
+        nnodes=nnodes,
+        rank=rank,
     )
-    old_expert_location_metadata.update(new_expert_location_metadata)
+    for layer_id in update_layer_ids:
+        # can optimize speed if needed
+        expect_value = _get_canary_value(new_expert_location_metadata, layer_id)
+        actual_value = routed_experts_weights_of_layer[layer_id][-1].cpu()
+        assert torch.all(expect_value == actual_value), (
+            f"{expect_value=} {actual_value=} {layer_id=} "
+            f"{old_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+            f"{new_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+        )
-def _update_expert_weights(
+def _update_expert_weights_raw(
     routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
     old_expert_location_metadata: ExpertLocationMetadata,
     new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
     nnodes: int,
     rank: int,
 ):
+    log_metrics = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS")
     temp_buffers = create_temp_buffers(
-        next(iter(routed_experts_weights_of_layer.values()))
+        routed_experts_weights_of_layer[update_layer_ids[0]]
     )
     world_size = torch.distributed.get_world_size()
     num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
     num_gpu_per_node = world_size // nnodes
-    old_physical_to_logical_map = (
-        old_expert_location_metadata.physical_to_logical_map.tolist()
-    )
-    new_physical_to_logical_map = (
-        new_expert_location_metadata.physical_to_logical_map.tolist()
-    )
-    for layer_id in sorted(routed_experts_weights_of_layer.keys()):
+    for layer_id in update_layer_ids:
         update_expert_weights_single_layer(
             routed_experts_weights=routed_experts_weights_of_layer[layer_id],
             temp_buffers=temp_buffers,
-            old_physical_to_logical_map=old_physical_to_logical_map[layer_id],
-            new_physical_to_logical_map=new_physical_to_logical_map[layer_id],
+            old_physical_to_logical_map=old_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
+            new_physical_to_logical_map=new_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
             num_local_physical_experts=num_local_physical_experts,
             num_gpu_per_node=num_gpu_per_node,
             rank=rank,
+            world_size=world_size,
+            log_metrics=log_metrics,
         )
@@ -89,7 +164,9 @@ def update_expert_weights_single_layer(
     num_local_physical_experts: int,
     num_gpu_per_node: int,
     rank: int,
+    world_size: Optional[int] = None,
     debug: bool = False,
+    log_metrics: bool = False,
 ):
     assert all(
         tensor.shape[0] == num_local_physical_experts
@@ -121,6 +198,14 @@ def update_expert_weights_single_layer(
         _execute_p2p_ops(p2p_op_infos)
         _execute_buffer2weight_copies(buffer2weight_copy_infos)
+        if log_metrics:
+            _log_p2p_op_metrics(
+                p2p_op_infos,
+                world_size=world_size,
+                num_gpu_per_node=num_gpu_per_node,
+                self_node_id=self_node_id,
+            )
         if debug:
             output_logs.append(f"{p2p_op_infos=}")
             output_logs.append(f"{buffer2weight_copy_infos=}")
@@ -420,3 +505,53 @@ def _deduplicate_ordered(arr: List[int]):
         if len(output) == 0 or item != output[-1]:
             output.append(item)
     return output
+def _log_p2p_op_metrics(
+    p2p_op_infos: List[Tuple[int, List[P2POp]]],
+    num_gpu_per_node: int,
+    world_size: int,
+    self_node_id: int,
+):
+    text = ""
+    all_ops = [op for _, ops in p2p_op_infos for op in ops]
+    for direction, ops in _group_by(all_ops, _get_direction_from_op).items():
+        nbytes_of_gpu = [0] * world_size
+        for op in ops:
+            nbytes_of_gpu[op.peer] += op.tensor.nbytes
+        nbytes_of_gpu = torch.tensor(nbytes_of_gpu, dtype=torch.int64)
+        nbytes_of_node = einops.reduce(
+            nbytes_of_gpu,
+            "(num_nodes num_gpu_per_node) -> num_nodes",
+            num_gpu_per_node=num_gpu_per_node,
+            reduction="sum",
+        )
+        nbytes_curr_node = nbytes_of_node[self_node_id]
+        nbytes_cross_node = torch.sum(nbytes_of_node) - nbytes_curr_node
+        text += (
+            f"{direction}_nbytes_of_gpu={nbytes_of_gpu.tolist()} "
+            f"{direction}_nbytes_of_node={nbytes_of_node.tolist()} "
+            f"{direction}_nbytes_curr_node={nbytes_curr_node.item()} "
+            f"{direction}_nbytes_cross_node={nbytes_cross_node.item()} "
+        )
+    logger.info(f"[ExpertLocationUpdater] {text}")
+def _get_direction_from_op(op: P2POp):
+    if op.op == torch.distributed.isend:
+        return "isend"
+    if op.op == torch.distributed.irecv:
+        return "irecv"
+    raise NotImplementedError
+def _group_by(items, keyfunc):
+    ans = defaultdict(list)
+    for item in items:
+        ans[keyfunc(item)].append(item)
+    return dict(ans)

sglang/srt/model_executor/forward_batch_info.py CHANGED Viewed

@@ -31,14 +31,15 @@ from __future__ import annotations
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from functools import total_ordering
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 import torch
 import triton
 import triton.language as tl
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.utils import flatten_nested_list, get_compiler_backend
+from sglang.srt.utils import flatten_nested_list, get_compiler_backend, support_triton
 if TYPE_CHECKING:
     from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
@@ -117,12 +118,14 @@ class ForwardMode(IntEnum):
         return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+@total_ordering
 class CaptureHiddenMode(IntEnum):
-    NULL = auto()
-    # Capture hidden states of all tokens.
-    FULL = auto()
+    # Do not capture anything.
+    NULL = 0
     # Capture a hidden state of the last token.
-    LAST = auto()
+    LAST = 1
+    # Capture hidden states of all tokens.
+    FULL = 2
     def need_capture(self):
         return self != CaptureHiddenMode.NULL
@@ -133,6 +136,9 @@ class CaptureHiddenMode(IntEnum):
     def is_last(self):
         return self == CaptureHiddenMode.LAST
+    def __lt__(self, other):
+        return self.value < other.value
 @dataclass
 class ForwardBatch:
@@ -218,6 +224,9 @@ class ForwardBatch:
     # For input embeddings
     input_embeds: Optional[torch.tensor] = None
+    # For cross-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
     # Sampling info
     sampling_info: SamplingBatchInfo = None
@@ -239,6 +248,7 @@ class ForwardBatch:
     dp_local_num_tokens: Optional[torch.Tensor] = None  # cached info at runtime
     gathered_buffer: Optional[torch.Tensor] = None
     can_run_dp_cuda_graph: bool = False
+    global_forward_mode: Optional[ForwardMode] = None
     # Speculative decoding
     spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
@@ -252,18 +262,19 @@ class ForwardBatch:
     # For Qwen2-VL
     mrope_positions: torch.Tensor = None
+    # For two-batch overlap
+    tbo_split_seq_index: Optional[int] = None
+    tbo_parent_token_range: Optional[Tuple[int, int]] = None
+    tbo_children: Optional[List["ForwardBatch"]] = None
     @classmethod
     def init_new(
         cls,
         batch: ModelWorkerBatch,
         model_runner: ModelRunner,
     ):
-        device = model_runner.device
-        extend_input_logprob_token_ids_gpu = None
-        if batch.extend_input_logprob_token_ids is not None:
-            extend_input_logprob_token_ids_gpu = (
-                batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
-            )
+        from sglang.srt.two_batch_overlap import TboForwardBatchPreparer
         ret = cls(
             forward_mode=batch.forward_mode,
             batch_size=len(batch.seq_lens),
@@ -277,10 +288,12 @@ class ForwardBatch:
             encoder_lens_cpu=batch.encoder_lens_cpu,
             encoder_out_cache_loc=batch.encoder_out_cache_loc,
             seq_lens_sum=batch.seq_lens_sum,
+            seq_lens_cpu=batch.seq_lens_cpu,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
             token_ids_logprobs=batch.token_ids_logprobs,
             can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
+            global_forward_mode=batch.global_forward_mode,
             lora_paths=batch.lora_paths,
             sampling_info=batch.sampling_info,
             req_to_token_pool=model_runner.req_to_token_pool,
@@ -290,11 +303,20 @@ class ForwardBatch:
             spec_info=batch.spec_info,
             capture_hidden_mode=batch.capture_hidden_mode,
             input_embeds=batch.input_embeds,
-            extend_input_logprob_token_ids_gpu=extend_input_logprob_token_ids_gpu,
-            num_token_non_padded=torch.tensor(
-                len(batch.input_ids), dtype=torch.int32
-            ).to(device, non_blocking=True),
+            token_type_ids=batch.token_type_ids,
+            tbo_split_seq_index=batch.tbo_split_seq_index,
         )
+        device = model_runner.device
+        if batch.extend_input_logprob_token_ids is not None:
+            ret.extend_input_logprob_token_ids_gpu = (
+                batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
+            )
+        if enable_num_token_non_padded(model_runner.server_args):
+            ret.num_token_non_padded = torch.tensor(
+                len(batch.input_ids), dtype=torch.int32
+            ).to(device, non_blocking=True)
         # For DP attention
         if batch.global_num_tokens is not None:
@@ -314,8 +336,10 @@ class ForwardBatch:
                 dtype=model_runner.dtype,
                 device=device,
             )
         if ret.forward_mode.is_idle():
             ret.positions = torch.empty((0,), device=device)
+            TboForwardBatchPreparer.prepare(ret)
             return ret
         # Override the positions with spec_info
@@ -325,10 +349,6 @@ class ForwardBatch:
         ):
             ret.positions = ret.spec_info.positions
-        # Get seq_lens_cpu if needed
-        if ret.seq_lens_cpu is None:
-            ret.seq_lens_cpu = batch.seq_lens_cpu
         # Init position information
         if ret.forward_mode.is_decode():
             if ret.positions is None:
@@ -340,8 +360,8 @@ class ForwardBatch:
             ret.extend_prefix_lens = torch.tensor(
                 batch.extend_prefix_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
-            if model_runner.server_args.attention_backend != "torch_native":
-                ret.extend_num_tokens = batch.extend_num_tokens
+            ret.extend_num_tokens = batch.extend_num_tokens
+            if support_triton(model_runner.server_args.attention_backend):
                 positions, ret.extend_start_loc = compute_position_triton(
                     ret.extend_prefix_lens,
                     ret.extend_seq_lens,
@@ -364,6 +384,8 @@ class ForwardBatch:
         if model_runner.server_args.lora_paths is not None:
             model_runner.lora_manager.prepare_lora_batch(ret)
+        TboForwardBatchPreparer.prepare(ret)
         return ret
     def merge_mm_inputs(self) -> Optional[MultimodalInputs]:
@@ -588,6 +610,14 @@ class ForwardBatch:
         # Precompute the kv indices for each chunk
         self.prepare_chunked_kv_indices(device)
+    @property
+    def can_run_tbo(self):
+        return self.tbo_split_seq_index is not None
+def enable_num_token_non_padded(server_args):
+    return server_args.enable_ep_moe or server_args.enable_deepep_moe
 class PPProxyTensors:
     # adapted from https://github.com/vllm-project/vllm/blob/d14e98d924724b284dc5eaf8070d935e214e50c0/vllm/sequence.py#L1103

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl