sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/lora/utils.py
CHANGED
@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple

 import torch

@@ -106,18 +106,22 @@ def get_hidden_dim(
     raise NotImplementedError()


-def
+def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to
+    Mapping a target module name to names of the normized LoRA weights.
+    Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
-        "q_proj": ("qkv_proj", "q_proj"),
-        "k_proj": ("qkv_proj", "kv_proj"),
-        "v_proj": ("qkv_proj", "kv_proj"),
-        "gate_proj": ("gate_up_proj", "gate_up_proj"),
-        "up_proj": ("gate_up_proj", "gate_up_proj"),
+        "q_proj": (["qkv_proj"], ["q_proj"]),
+        "k_proj": (["qkv_proj"], ["kv_proj"]),
+        "v_proj": (["qkv_proj"], ["kv_proj"]),
+        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
+        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
     }
-
+    stacked = params_mapping.get(name, ([name], [name]))
+    return stacked


 def get_stacked_multiply(module_name: str) -> int:
@@ -133,7 +137,7 @@ def get_stacked_multiply(module_name: str) -> int:


 def get_weight_name(
-    target_name: str, lora_weight_names: Set[
+    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
 ) -> Optional[str]:
     """
     target_name is name of a given module,
@@ -142,9 +146,9 @@ def get_weight_name(
     Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for
-        if
-            return
+    for weight_name in lora_weight_names[idx]:
+        if weight_name in target_name:
+            return weight_name
     raise ValueError(
         f"Cannot find weight name for {target_name} in {lora_weight_names}"
     )
sglang/srt/managers/cache_controller.py
CHANGED
@@ -22,7 +22,8 @@ from typing import List, Optional

 import torch

-from sglang.srt.mem_cache.memory_pool import
+from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache

 logger = logging.getLogger(__name__)

sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -248,12 +248,20 @@ class DataParallelController:

     def round_robin_scheduler(self, req: Req):
         if self.server_args.disaggregation_mode == "null":
-
-
-                self.workers
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
         else:
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)

     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
sglang/srt/managers/eplb_algorithms/__init__.py
ADDED
@@ -0,0 +1,63 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
+
+
+class EplbAlgorithm(Enum):
+    deepseek = auto()
+    deepseek_hierarchical = auto()
+    deepseek_vec = auto()
+    deepseek_vec_hierarchical = auto()
+    # TODO may have more algorithm later
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    algorithm: EplbAlgorithm,
+):
+    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+        return deepseek.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+        )
+
+    if algorithm in [
+        EplbAlgorithm.deepseek_vec,
+        EplbAlgorithm.deepseek_vec_hierarchical,
+    ]:
+        return deepseek_vec.rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+        )
+
+    raise NotImplementedError
+
+
+def compute_algorithm(
+    raw_algorithm: str,
+    num_groups: Optional[int],
+    num_nodes: int,
+) -> EplbAlgorithm:
+    if raw_algorithm != "auto":
+        return EplbAlgorithm[raw_algorithm]
+
+    # TODO test on real scenarios and know which ones perform better
+    if (num_groups is not None) and (num_groups % num_nodes == 0):
+        return EplbAlgorithm.deepseek_hierarchical
+    else:
+        return EplbAlgorithm.deepseek
sglang/srt/managers/eplb_algorithms/deepseek.py
ADDED
@@ -0,0 +1,223 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Tuple
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var
+
+
+def balanced_packing(
+    weight: torch.Tensor, num_packs: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
+    are as balanced as possible.
+
+    Parameters:
+        weight: [X, n], the weight of each item
+        num_packs: number of packs
+
+    Returns:
+        pack_index: [X, n], the pack index of each item
+        rank_in_pack: [X, n], the rank of the item in the pack
+    """
+    num_layers, num_groups = weight.shape
+    assert num_groups % num_packs == 0
+    groups_per_pack = num_groups // num_packs
+
+    if groups_per_pack == 1:
+        pack_index = torch.arange(
+            weight.size(-1), dtype=torch.int64, device=weight.device
+        ).expand(weight.shape)
+        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+        return pack_index, rank_in_pack
+
+    indices = weight.float().sort(-1, descending=True).indices.cpu()
+    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
+    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+    for i in range(num_layers):
+        pack_weights = [0] * num_packs
+        pack_items = [0] * num_packs
+        for group in indices[i]:
+            pack = min(
+                (i for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                key=pack_weights.__getitem__,
+            )
+            assert pack_items[pack] < groups_per_pack
+            pack_index[i, group] = pack
+            rank_in_pack[i, group] = pack_items[pack]
+            pack_weights[pack] += weight[i, group]
+            pack_items[pack] += 1
+    return pack_index, rank_in_pack
+
+
+def replicate_experts(
+    weight: torch.Tensor, num_phy: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
+
+    Parameters:
+        weight: [X, num_log]
+        num_phy: total number of experts after replication
+
+    Returns:
+        phy2log: [X, num_phy], logical expert id of each physical expert
+        rank: [X, num_phy], the replica rank
+        logcnt: [X, num_log], number of replicas for each logical expert
+    """
+    n, num_log = weight.shape
+    num_redundant = num_phy - num_log
+    assert num_redundant >= 0
+    device = weight.device
+    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
+    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+    arangen = torch.arange(n, dtype=torch.int64, device=device)
+    for i in range(num_log, num_phy):
+        redundant_indices = (weight / logcnt).max(dim=-1).indices
+        phy2log[:, i] = redundant_indices
+        rank[:, i] = logcnt[arangen, redundant_indices]
+        logcnt[arangen, redundant_indices] += 1
+    return phy2log, rank, logcnt
+
+
+def rebalance_experts_hierarchical(
+    weight: torch.Tensor,
+    num_physical_experts: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+):
+    """
+    Parameters:
+        weight: [num_moe_layers, num_logical_experts]
+        num_physical_experts: number of physical experts after replication
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [num_moe_layers, num_physical_experts]
+        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+        logical_count: [num_moe_layers, num_logical_experts]
+    """
+    num_layers, num_logical_experts = weight.shape
+    assert num_logical_experts % num_groups == 0
+    group_size = num_logical_experts // num_groups
+    assert num_groups % num_nodes == 0
+    groups_per_node = num_groups // num_nodes
+    assert num_gpus % num_nodes == 0
+    assert num_physical_experts % num_gpus == 0
+    phy_experts_per_gpu = num_physical_experts // num_gpus
+
+    def inverse(perm: torch.Tensor) -> torch.Tensor:
+        inv = torch.empty_like(perm)
+        inv.scatter_(
+            1,
+            perm,
+            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
+                perm.shape
+            ),
+        )
+        return inv
+
+    # Step 1: pack groups to nodes
+    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
+    log2mlog = (
+        (
+            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+        ).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
+    ).flatten(-2)
+    mlog2log = inverse(log2mlog)
+
+    # Step 2: construct redundant experts within nodes
+    # [num_layers * num_nodes, num_logical_experts // num_nodes]
+    tokens_per_mlog = weight.gather(-1, mlog2log).view(
+        -1, num_logical_experts // num_nodes
+    )
+    phy2mlog, phyrank, mlogcnt = replicate_experts(
+        tokens_per_mlog, num_physical_experts // num_nodes
+    )
+
+    # Step 3: pack physical_experts to GPUs
+    # [num_layers * num_nodes, num_physical_experts // num_nodes]
+    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
+    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+    pphy2phy = inverse(phy2pphy)
+
+    pphy2mlog = phy2mlog.gather(
+        -1, pphy2phy
+    )  # [num_layers * num_nodes, num_log_per_nodes]
+    pphy2mlog = (
+        pphy2mlog.view(num_layers, num_nodes, -1)
+        + torch.arange(
+            0,
+            num_logical_experts,
+            num_logical_experts // num_nodes,
+            device=group_pack_index.device,
+        ).view(1, -1, 1)
+    ).flatten(-2)
+    pphy2log = mlog2log.gather(-1, pphy2mlog)
+    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+    return pphy2log, pphyrank, logcnt
+
+
+def rebalance_experts(
+    weight: torch.Tensor,
+    num_replicas: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+    enable_hierarchical: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Entry point for expert-parallelism load balancer.
+
+    Parameters:
+        weight: [layers, num_logical_experts], the load statistics for all logical experts
+        num_replicas: number of physical experts, must be a multiple of `num_gpus`
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+    """
+
+    num_layers, num_logical_experts = weight.shape
+    weight = weight.float().cpu()
+    if enable_hierarchical:
+        # use hierarchical load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
+    else:
+        # use global load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, 1, 1, num_gpus
+        )
+    maxlogcnt = logcnt.max().item()
+    log2phy: torch.Tensor = torch.full(
+        (num_layers, num_logical_experts, maxlogcnt),
+        -1,
+        dtype=torch.int64,
+        device=logcnt.device,
+    )
+    log2phy.view(num_layers, -1).scatter_(
+        -1,
+        phy2log * maxlogcnt + phyrank,
+        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+            num_layers, -1
+        ),
+    )
+    return phy2log, log2phy, logcnt
+
+
+__all__ = ["rebalance_experts"]
sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py}
CHANGED
@@ -1,6 +1,5 @@
 # This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
-
-from typing import Literal, Tuple
+from typing import Optional, Tuple

 import torch

@@ -257,11 +256,11 @@ def rebalance_experts(
     tokens_per_expert: torch.Tensor,
     num_physical_experts: int,
     num_local_physical_experts: int,
-    num_groups: int,
+    num_groups: Optional[int],
     num_nodes: int,
-
+    enable_hierarchical: bool,
 ):
-    if
+    if enable_hierarchical:
         return prefill_rebalance_experts(
             tokens_per_expert=tokens_per_expert,
             num_physical_experts=num_physical_experts,
@@ -269,10 +268,9 @@ def rebalance_experts(
             num_groups=num_groups,
             num_nodes=num_nodes,
         )
-
+    else:
         return decode_rebalance_experts(
             tokens_per_expert=tokens_per_expert,
             num_physical_experts=num_physical_experts,
             num_local_physical_experts=num_local_physical_experts,
         )
-    raise NotImplementedError
sglang/srt/managers/eplb_manager.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 import time
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List

 import torch.cuda

@@ -20,27 +20,45 @@ class EPLBManager:
         super().__init__()
         self._model_runner = model_runner
         self._server_args = model_runner.server_args
+        self._rebalance_layers_per_chunk = (
+            self._server_args.eplb_rebalance_layers_per_chunk
+        )
+        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations

         # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
         assert (
             self._server_args.eplb_rebalance_num_iterations
-
-        ), "eplb_rebalance_num_iterations must be
+            >= self._server_args.expert_distribution_recorder_buffer_size
+        ), "eplb_rebalance_num_iterations must be greater than expert_distribution_recorder_buffer_size"

-        get_global_expert_distribution_recorder().
+        if not get_global_expert_distribution_recorder().recording:
+            get_global_expert_distribution_recorder().start_record()

         logger.info(
-            f"[EPLBManager] system started, will rebalance per {self.
+            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
         )

-
-
-
+        self._main_generator = self._entrypoint()
+
+    def on_forward_pass_end(self):
+        next(self._main_generator)
+
+    # can be more complex if needed
+    def _entrypoint(self):
+        while True:
+            for _ in range(self._rebalance_num_iterations):
+                yield
+
+            yield from self.rebalance()

     def rebalance(self):
         logger.info("[EPLBManager] rebalance start")
-
-
+
+        enable_timing = self._rebalance_layers_per_chunk is None
+
+        if enable_timing:
+            torch.cuda.synchronize()
+            time_start = time.time()

         logical_count = get_global_expert_distribution_recorder().dump_record(
             output_mode="object"
@@ -48,8 +66,31 @@ class EPLBManager:
         expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
             self._server_args, self._model_runner.model_config, logical_count
         )
-        self._model_runner.update_expert_location(expert_location_metadata)

-
-
-
+        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
+        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
+            if len(update_layer_ids_chunks) > 1:
+                yield
+            self._model_runner.update_expert_location(
+                expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+            )
+
+        msg = f"[EPLBManager] rebalance end"
+        if enable_timing:
+            torch.cuda.synchronize()
+            time_end = time.time()
+            msg += f" time={time_end - time_start:.3f}s"
+        logger.info(msg)
+
+    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
+        all_layer_ids = sorted(
+            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
+        )
+        chunk_size = self._rebalance_layers_per_chunk or 1000000
+        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
+
+
+def _chunk_list(items: List, chunk_size):
+    for start_index in range(0, len(items), chunk_size):
+        yield items[start_index : start_index + chunk_size]