sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,63 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
|
7
|
+
|
8
|
+
|
9
|
+
class EplbAlgorithm(Enum):
    """Expert-parallelism load-balancing (EPLB) algorithm selector.

    The `*_hierarchical` variants additionally respect expert-group / node
    boundaries when placing replicas; the `deepseek_vec` variants operate on
    per-step token counts instead of a single aggregated load vector.
    """

    deepseek = auto()
    deepseek_hierarchical = auto()
    deepseek_vec = auto()
    deepseek_vec_hierarchical = auto()
    # TODO may have more algorithm later
15
|
+
|
16
|
+
|
17
|
+
def rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: Optional[int],
    num_nodes: int,
    algorithm: EplbAlgorithm,
):
    """Dispatch to the rebalancing backend selected by `algorithm`.

    Parameters:
        tokens_per_expert: per-step, per-layer token counts for each logical expert
        num_physical_experts: total physical experts after replication
        num_local_physical_experts: physical experts hosted on one GPU
        num_groups: number of expert groups (may be None for ungrouped models)
        num_nodes: number of server nodes
        algorithm: which EPLB algorithm to run

    Raises:
        NotImplementedError: if `algorithm` is not a known variant.
    """
    num_gpus = num_physical_experts // num_local_physical_experts

    if algorithm in (EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical):
        # The deepseek backend consumes the load summed over recording steps.
        return deepseek.rebalance_experts(
            weight=tokens_per_expert.sum(dim=0),
            num_replicas=num_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            num_gpus=num_gpus,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
        )

    if algorithm in (
        EplbAlgorithm.deepseek_vec,
        EplbAlgorithm.deepseek_vec_hierarchical,
    ):
        # The vec backend keeps the per-step dimension of the statistics.
        return deepseek_vec.rebalance_experts(
            tokens_per_expert=tokens_per_expert,
            num_physical_experts=num_physical_experts,
            num_local_physical_experts=num_local_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
        )

    raise NotImplementedError
49
|
+
|
50
|
+
|
51
|
+
def compute_algorithm(
    raw_algorithm: str,
    num_groups: Optional[int],
    num_nodes: int,
) -> EplbAlgorithm:
    """Resolve a user-supplied algorithm name into a concrete EplbAlgorithm.

    "auto" picks between the deepseek variants based on whether the model's
    expert groups divide evenly across nodes; any other string is looked up
    as an enum member name (raising KeyError if unknown).
    """
    if raw_algorithm != "auto":
        # Explicit request: look up the enum member by name.
        return EplbAlgorithm[raw_algorithm]

    # TODO test on real scenarios and know which ones perform better
    can_use_hierarchical = num_groups is not None and num_groups % num_nodes == 0
    return (
        EplbAlgorithm.deepseek_hierarchical
        if can_use_hierarchical
        else EplbAlgorithm.deepseek
    )
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
|
2
|
+
from typing import Tuple
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
from sglang.srt.utils import get_bool_env_var
|
7
|
+
|
8
|
+
|
9
|
+
def balanced_packing(
    weight: torch.Tensor, num_packs: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Distribute n weighted items into m packs so that every pack holds exactly
    n/m items while the total weights of the packs stay as even as possible.

    Parameters:
        weight: [X, n], per-item weights
        num_packs: number of packs m (must divide n)

    Returns:
        pack_index: [X, n], destination pack of each item
        rank_in_pack: [X, n], position of each item inside its pack
    """
    num_layers, num_groups = weight.shape
    assert num_groups % num_packs == 0
    groups_per_pack = num_groups // num_packs

    if groups_per_pack == 1:
        # One item per pack: the assignment is simply the identity mapping.
        identity = torch.arange(
            weight.size(-1), dtype=torch.int64, device=weight.device
        )
        return identity.expand(weight.shape), torch.zeros_like(
            weight, dtype=torch.int64
        )

    # Greedy heuristic: visit items heaviest-first and drop each into the
    # lightest pack that still has room.
    order = weight.float().sort(-1, descending=True).indices.cpu()
    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
    for layer in range(num_layers):
        load = [0] * num_packs
        fill = [0] * num_packs
        for item in order[layer]:
            open_packs = (p for p in range(num_packs) if fill[p] < groups_per_pack)
            target = min(open_packs, key=load.__getitem__)
            assert fill[target] < groups_per_pack
            pack_index[layer, item] = target
            rank_in_pack[layer, item] = fill[target]
            load[target] += weight[layer, item]
            fill[target] += 1
    return pack_index, rank_in_pack
52
|
+
|
53
|
+
|
54
|
+
def replicate_experts(
    weight: torch.Tensor, num_phy: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Grow `num_log` logical experts into `num_phy` physical replicas, greedily
    duplicating whichever expert currently carries the highest load per replica,
    so that the maximum replica load is minimized.

    Parameters:
        weight: [X, num_log], load of each logical expert
        num_phy: total number of physical replicas to produce (>= num_log)

    Returns:
        phy2log: [X, num_phy], logical expert backing each physical slot
        rank: [X, num_phy], replica rank of each physical slot
        logcnt: [X, num_log], replica count per logical expert
    """
    n, num_log = weight.shape
    assert num_phy >= num_log
    device = weight.device

    # Slots [0, num_log) start as the identity; slots [num_log, num_phy) are
    # overwritten below, one per iteration.
    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)

    rows = torch.arange(n, dtype=torch.int64, device=device)
    for slot in range(num_log, num_phy):
        # Per-row argmax of load-per-replica decides who receives the next copy.
        chosen = (weight / logcnt).max(dim=-1).indices
        phy2log[:, slot] = chosen
        rank[:, slot] = logcnt[rows, chosen]
        logcnt[rows, chosen] += 1
    return phy2log, rank, logcnt
83
|
+
|
84
|
+
|
85
|
+
def rebalance_experts_hierarchical(
    weight: torch.Tensor,
    num_physical_experts: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
        logical_count: [num_moe_layers, num_logical_experts]
    """
    num_layers, num_logical_experts = weight.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0
    groups_per_node = num_groups // num_nodes
    assert num_gpus % num_nodes == 0
    assert num_physical_experts % num_gpus == 0
    phy_experts_per_gpu = num_physical_experts // num_gpus

    def inverse(perm: torch.Tensor) -> torch.Tensor:
        # Invert a batched permutation: inv[i, perm[i, j]] = j.
        inv = torch.empty_like(perm)
        inv.scatter_(
            1,
            perm,
            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
                perm.shape
            ),
        )
        return inv

    # Step 1: pack groups to nodes
    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
    # log2mlog maps each logical expert id to its "meta-logical" id: the
    # position it occupies after the groups are reordered onto nodes.
    log2mlog = (
        (
            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
        ).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
    ).flatten(-2)
    mlog2log = inverse(log2mlog)

    # Step 2: construct redundant experts within nodes
    # [num_layers * num_nodes, num_logical_experts // num_nodes]
    tokens_per_mlog = weight.gather(-1, mlog2log).view(
        -1, num_logical_experts // num_nodes
    )
    phy2mlog, phyrank, mlogcnt = replicate_experts(
        tokens_per_mlog, num_physical_experts // num_nodes
    )

    # Step 3: pack physical_experts to GPUs
    # [num_layers * num_nodes, num_physical_experts // num_nodes]
    # Each replica is charged its expert's load divided by the replica count.
    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
    pphy2phy = inverse(phy2pphy)

    pphy2mlog = phy2mlog.gather(
        -1, pphy2phy
    )  # [num_layers * num_nodes, num_log_per_nodes]
    # Offset each node's local meta-logical ids back into the global id space.
    pphy2mlog = (
        pphy2mlog.view(num_layers, num_nodes, -1)
        + torch.arange(
            0,
            num_logical_experts,
            num_logical_experts // num_nodes,
            device=group_pack_index.device,
        ).view(1, -1, 1)
    ).flatten(-2)
    pphy2log = mlog2log.gather(-1, pphy2mlog)
    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
    return pphy2log, pphyrank, logcnt
168
|
+
|
169
|
+
|
170
|
+
def rebalance_experts(
    weight: torch.Tensor,
    num_replicas: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
    enable_hierarchical: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], accumulated load of every logical expert
        num_replicas: number of physical experts, must be a multiple of `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes (intra-node links such as NVLink are faster)
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
        enable_hierarchical: respect group/node boundaries when placing replicas

    Returns:
        physical_to_logical_map: [layers, num_replicas], expert index of each replica
        logical_to_physical_map: [layers, num_logical_experts, X], replica slots
            of each expert, padded with -1
        expert_count: [layers, num_logical_experts], replicas per logical expert
    """

    num_layers, num_logical_experts = weight.shape
    weight = weight.float().cpu()

    # The global policy is the hierarchical one collapsed onto a single
    # node with a single group, so both branches share one implementation.
    if enable_hierarchical:
        groups, nodes = num_groups, num_nodes
    else:
        groups, nodes = 1, 1
    phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
        weight, num_replicas, groups, nodes, num_gpus
    )

    # Build the inverse map: for every (expert, replica-rank) pair, record the
    # physical slot that holds it; unused ranks stay -1.
    maxlogcnt = logcnt.max().item()
    log2phy: torch.Tensor = torch.full(
        (num_layers, num_logical_experts, maxlogcnt),
        -1,
        dtype=torch.int64,
        device=logcnt.device,
    )
    log2phy.view(num_layers, -1).scatter_(
        -1,
        phy2log * maxlogcnt + phyrank,
        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
            num_layers, -1
        ),
    )
    return phy2log, log2phy, logcnt


__all__ = ["rebalance_experts"]
|
@@ -0,0 +1,276 @@
|
|
1
|
+
# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
|
2
|
+
from typing import Optional, Tuple
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
|
7
|
+
def pack_groups(tokens_per_group: torch.Tensor, num_nodes: int) -> torch.Tensor:
    """
    Greedily assign expert groups to nodes so that every node receives exactly
    num_groups / num_nodes groups and per-node token loads stay balanced.

    Groups are visited heaviest-first; each goes to the lightest-loaded node
    that still has room.

    Parameters:
        tokens_per_group: [num_layers, num_groups], token load of each group
        num_nodes: number of nodes

    Returns:
        [num_layers, num_groups] int64 tensor (on CPU) mapping each group to
        its packed slot: node_index * groups_per_rank + position_within_node.
    """
    num_layers, num_groups = tokens_per_group.shape
    assert num_groups % num_nodes == 0
    groups_per_rank = num_groups // num_nodes

    indices = tokens_per_group.float().sort(-1, descending=True).indices.cpu()
    ret = torch.full_like(
        tokens_per_group, fill_value=-1, dtype=torch.int64, device="cpu"
    )
    for layer in range(num_layers):
        node_tokens = [0] * num_nodes
        node_groups = [0] * num_nodes
        for group in indices[layer]:

            # Fix: the original annotated this as `-> int`, but it returns a
            # 2-tuple used for lexicographic comparison (full nodes sort last,
            # then lightest load wins).
            def key_func(rank: int) -> Tuple[int, int]:
                if node_groups[rank] >= groups_per_rank:
                    return 1, 0
                else:
                    return 0, node_tokens[rank]

            rank = min(range(num_nodes), key=key_func)
            assert node_groups[rank] < groups_per_rank
            ret[layer, group] = rank * groups_per_rank + node_groups[rank]
            node_tokens[rank] += tokens_per_group[layer, group]
            node_groups[rank] += 1
    return ret
33
|
+
|
34
|
+
|
35
|
+
def make_redundant_experts_chunkwise(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_physical_experts_per_chunk: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Create redundant (replicated) experts independently inside each chunk of
    physical expert slots, then optionally re-shuffle slots to balance load
    across GPUs.

    Parameters:
        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts]
        num_physical_experts: total physical experts (logical + redundant)
        num_local_physical_experts: physical experts hosted on one GPU
        num_physical_experts_per_chunk: chunk size; must divide
            `num_physical_experts`, and the logical/redundant experts must
            split evenly across the resulting chunks

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map:
            [num_moe_layers, num_logical_experts, num_redundancy_experts + 1],
            padded with -1
        logical_count: [num_moe_layers, num_logical_experts], replica counts
    """
    num_steps, num_moe_layers, num_logical_experts = tokens_per_expert.shape
    num_redundancy_experts = num_physical_experts - num_logical_experts

    physical_to_logical_map = torch.empty(
        num_moe_layers,
        num_physical_experts,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )
    logical_to_physical_map = torch.full(
        (num_moe_layers, num_logical_experts, num_redundancy_experts + 1),
        -1,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )
    logical_count = torch.ones(
        num_moe_layers,
        num_logical_experts,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )

    assert num_physical_experts % num_physical_experts_per_chunk == 0
    num_chunks = num_physical_experts // num_physical_experts_per_chunk
    assert num_logical_experts % num_chunks == 0
    num_logical_experts_per_group = num_logical_experts // num_chunks
    assert num_redundancy_experts % num_chunks == 0
    num_redundancy_experts_per_group = num_redundancy_experts // num_chunks

    arange_num_moe_layers_num_groups = torch.arange(
        num_moe_layers * num_chunks, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_logical_experts = torch.arange(
        num_logical_experts, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_logical_experts_per_group = torch.arange(
        num_logical_experts_per_group, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_groups = torch.arange(
        num_chunks, dtype=torch.int, device=tokens_per_expert.device
    )
    # Identity placement: the first num_logical_experts_per_group slots of each
    # chunk hold that chunk's logical experts in order.
    physical_to_logical_map.view(
        num_moe_layers, num_chunks, num_physical_experts_per_chunk
    )[:, :, :num_logical_experts_per_group] = arange_num_logical_experts.view(
        num_chunks, num_logical_experts_per_group
    )
    # Rank-0 replica of every expert is its identity slot computed above.
    logical_to_physical_map[:, :, 0] = (
        arange_num_logical_experts_per_group.expand(
            num_chunks, num_logical_experts_per_group
        )
        + arange_num_groups[:, None] * num_physical_experts_per_chunk
    ).view(num_logical_experts)

    # Tiny per-expert offsets make all scores pairwise distinct so argmax
    # ties break deterministically.
    tokens_per_expert_all_diff = tokens_per_expert + arange_num_logical_experts * 1e-4
    # Greedily fill each chunk's redundant slots: each round duplicates the
    # expert whose duplication most reduces the chunk's worst per-replica load.
    for i in range(num_redundancy_experts_per_group):
        score = (
            tokens_per_expert_all_diff / logical_count
        )  # NOTE: Values in score must be different from each other
        score1 = tokens_per_expert / (logical_count + 1)
        score = score.view(
            num_steps, num_moe_layers, num_chunks, num_logical_experts_per_group
        )
        score1 = score1.view_as(score)
        values, indices = score.max(-1, keepdim=True)
        values = values.expand_as(score).contiguous()
        # Hypothetically give the current max expert one more replica and
        # recompute the chunk's max load for each candidate.
        score.scatter_(-1, indices, score1.gather(-1, indices))
        values.scatter_(-1, indices, score.max(-1, keepdim=True).values)
        # Pick the candidate that minimizes the load summed over steps.
        redundancy_indices = values.sum(0).argmin(-1)
        physical_to_logical_map.view(
            num_moe_layers, num_chunks, num_physical_experts_per_chunk
        )[:, :, num_logical_experts_per_group + i] = (
            redundancy_indices + arange_num_groups * num_logical_experts_per_group
        )
        redundancy_count = (
            logical_count.view(
                num_moe_layers * num_chunks, num_logical_experts_per_group
            )
            .gather(-1, redundancy_indices.view(num_moe_layers * num_chunks, 1))
            .squeeze(1)
        )
        physical_redundancy_indices = (
            (
                arange_num_groups * num_physical_experts_per_chunk
                + num_logical_experts_per_group
                + i
            )
            .expand(num_moe_layers, num_chunks)
            .flatten()
        )
        # Record the new replica slot at the expert's next free rank.
        logical_to_physical_map.view(
            num_moe_layers * num_chunks,
            num_logical_experts_per_group,
            num_redundancy_experts + 1,
        )[
            arange_num_moe_layers_num_groups,
            redundancy_indices.view(num_moe_layers * num_chunks),
            redundancy_count,
        ] = physical_redundancy_indices
        logical_count.view(num_moe_layers * num_chunks, num_logical_experts_per_group)[
            arange_num_moe_layers_num_groups,
            redundancy_indices.view(num_moe_layers * num_chunks),
        ] += 1

    if num_local_physical_experts > 1:
        # Load-balancing between GPUs
        physical_to_logical_map_int64 = physical_to_logical_map.to(torch.int64)
        counts = logical_count.gather(-1, physical_to_logical_map_int64)
        # Per-slot load: expert load (summed over steps) split among replicas.
        score = tokens_per_expert.sum(0).gather(-1, physical_to_logical_map_int64)
        score = score / counts
        score = score.view(num_moe_layers, num_chunks, num_physical_experts_per_chunk)
        indices = score.argsort(-1, descending=True)
        indices += torch.arange(
            0,
            num_physical_experts,
            num_physical_experts_per_chunk,
            dtype=indices.dtype,
            device=indices.device,
        )[None, :, None]

        assert num_physical_experts_per_chunk % num_local_physical_experts == 0
        num_local_groups = num_physical_experts_per_chunk // num_local_physical_experts
        # Deal slots onto GPUs heaviest-first, reversing every other row
        # (snake order) so heavy and light slots interleave across GPUs.
        indices = indices.view(
            num_moe_layers, num_chunks, num_local_physical_experts, num_local_groups
        )
        indices[:, :, 1::2, :] = indices[:, :, 1::2, :].flip(-1)
        indices = indices.transpose(2, 3)
        indices = indices.reshape(num_moe_layers, num_physical_experts)
        physical_to_logical_map = physical_to_logical_map.gather(-1, indices)
        # Remap logical_to_physical_map through the permutation; -1 padding is
        # masked out first so it is not treated as a real slot index.
        mask = logical_to_physical_map == -1
        logical_to_physical_map[mask] = 0
        logical_to_physical_map = (
            indices.argsort(-1)
            .gather(
                -1, logical_to_physical_map.view(num_moe_layers, -1).to(torch.int64)
            )
            .view_as(logical_to_physical_map)
            .to(torch.int)
        )
        logical_to_physical_map[mask] = -1

    return physical_to_logical_map, logical_to_physical_map, logical_count
182
|
+
|
183
|
+
|
184
|
+
def decode_rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
):
    """Rebalance for decode: treat all physical experts as one single chunk."""
    return make_redundant_experts_chunkwise(
        tokens_per_expert=tokens_per_expert,
        num_physical_experts=num_physical_experts,
        num_local_physical_experts=num_local_physical_experts,
        num_physical_experts_per_chunk=num_physical_experts,
    )
195
|
+
|
196
|
+
|
197
|
+
def prefill_rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: int,
    num_nodes: int,
):
    """
    Hierarchical rebalance used for prefill: first pack expert groups onto
    nodes, then create redundant experts chunk-wise within each node.

    Parameters:
        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts]
        num_physical_experts: total physical experts after replication
        num_local_physical_experts: physical experts hosted on one GPU
        num_groups: number of expert groups
        num_nodes: number of nodes

    Returns:
        (phy2log, log2phy, log_count), all expressed in the original
        logical-expert id space.
    """
    tokens_per_expert = tokens_per_expert.float().cpu()

    num_steps, _, num_logical_experts = tokens_per_expert.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0, f"{num_groups=} {num_nodes=}"

    # Aggregate loads over steps, then per group, and pack the groups onto nodes.
    tokens_per_group = tokens_per_expert.sum(0).unflatten(-1, (num_groups, -1)).sum(-1)
    group_perm = pack_groups(
        tokens_per_group, num_nodes
    )  # [num_moe_layers, num_groups] -> packed slot id of each group

    # log2mlog [layers, #logexp] -> [layers, #logexp]: logical id to the
    # "meta-logical" id it occupies after groups are reordered onto nodes.
    log2mlog = (
        (group_perm * group_size).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_perm.device)
    ).flatten(-2)

    # mlog2log [layers, #logexp] -> [layers, #logexp], inverse of log2mlog
    mlog2log = torch.empty_like(log2mlog)
    arange = torch.arange(
        num_logical_experts, dtype=torch.int64, device=mlog2log.device
    )
    mlog2log.scatter_(1, log2mlog, arange.expand(log2mlog.size(0), -1))

    # tokens_per_mlog[i][j][k] = tokens_per_expert[i][j][mlog2log[j][k]]
    tokens_per_mlog = tokens_per_expert.gather(
        2, mlog2log.unsqueeze(0).expand(num_steps, -1, -1)
    )

    # Replicate experts within node-sized chunks of physical slots.
    phy2mlog, mlog2phy, mlog_count = make_redundant_experts_chunkwise(
        tokens_per_mlog,
        num_physical_experts,
        num_local_physical_experts,
        num_physical_experts // num_nodes,
    )

    # phy2log[i][j] = mlog2log[i][phy2mlog[i][j]]
    phy2log = mlog2log.gather(1, phy2mlog.to(torch.int64))

    # mlog2phy: [num_moe_layers, num_logical_experts, ...]
    # log2phy[i][j][k] = mlog2phy[i][log2mlog[i][j]][k]
    log2phy = mlog2phy.gather(
        1, log2mlog.unsqueeze(-1).expand(-1, -1, mlog2phy.size(-1)).to(torch.int64)
    )

    # log_count[i][j] = mlog_count[i][log2mlog[i][j]]
    log_count = mlog_count.gather(1, log2mlog)
    return phy2log, log2phy, log_count
253
|
+
|
254
|
+
|
255
|
+
def rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: Optional[int],
    num_nodes: int,
    enable_hierarchical: bool,
):
    """Dispatch between the prefill (hierarchical) and decode (flat) balancers."""
    common = dict(
        tokens_per_expert=tokens_per_expert,
        num_physical_experts=num_physical_experts,
        num_local_physical_experts=num_local_physical_experts,
    )
    if enable_hierarchical:
        return prefill_rebalance_experts(
            num_groups=num_groups, num_nodes=num_nodes, **common
        )
    return decode_rebalance_experts(**common)
|
@@ -0,0 +1,96 @@
|
|
1
|
+
import logging
|
2
|
+
import time
|
3
|
+
from typing import TYPE_CHECKING, List
|
4
|
+
|
5
|
+
import torch.cuda
|
6
|
+
|
7
|
+
from sglang.srt.managers.expert_distribution import (
|
8
|
+
get_global_expert_distribution_recorder,
|
9
|
+
)
|
10
|
+
from sglang.srt.managers.expert_location import ExpertLocationMetadata
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from sglang.srt.model_executor.model_runner import ModelRunner
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
class EPLBManager:
    """Periodically rebalances expert placement (EPLB) for a MoE model.

    The model runner calls :meth:`on_forward_pass_end` once per forward pass;
    this advances an internal generator so that a rebalance is triggered every
    ``eplb_rebalance_num_iterations`` passes. When
    ``eplb_rebalance_layers_per_chunk`` is set, the weight relocation is spread
    across multiple forward passes, one chunk of layers per pass.
    """

    def __init__(self, model_runner: "ModelRunner"):
        super().__init__()
        self._model_runner = model_runner
        self._server_args = model_runner.server_args
        # When None, all layers are updated in a single step (and timed).
        self._rebalance_layers_per_chunk = (
            self._server_args.eplb_rebalance_layers_per_chunk
        )
        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations

        # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
        # NOTE: message matches the `>=` condition below (was previously
        # misleadingly worded as strictly "greater than").
        assert (
            self._server_args.eplb_rebalance_num_iterations
            >= self._server_args.expert_distribution_recorder_buffer_size
        ), "eplb_rebalance_num_iterations must be greater than or equal to expert_distribution_recorder_buffer_size"

        # Rebalancing needs fresh expert-distribution statistics.
        if not get_global_expert_distribution_recorder().recording:
            get_global_expert_distribution_recorder().start_record()

        logger.info(
            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
        )

        self._main_generator = self._entrypoint()

    def on_forward_pass_end(self):
        """Advance the driver generator by one step (called once per forward pass)."""
        next(self._main_generator)

    # can be more complex if needed
    def _entrypoint(self):
        """Driver generator: idle for N passes, then run (possibly chunked) rebalance."""
        while True:
            for _ in range(self._rebalance_num_iterations):
                yield

            yield from self.rebalance()

    def rebalance(self):
        """Recompute expert locations from recorded stats and apply them.

        This is a generator: when layer chunking is enabled it yields between
        chunks so that the update is interleaved with forward passes.
        """
        logger.info("[EPLBManager] rebalance start")

        # Wall-clock timing is only meaningful when the whole update happens
        # in one shot; with chunking the work spans multiple forward passes.
        enable_timing = self._rebalance_layers_per_chunk is None

        if enable_timing:
            torch.cuda.synchronize()
            time_start = time.time()

        logical_count = get_global_expert_distribution_recorder().dump_record(
            output_mode="object"
        )["logical_count"]
        expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
            self._server_args, self._model_runner.model_config, logical_count
        )

        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
            # Yield before every chunk (when there is more than one) so each
            # chunk's weight movement happens on a separate forward pass.
            if len(update_layer_ids_chunks) > 1:
                yield
            self._model_runner.update_expert_location(
                expert_location_metadata,
                update_layer_ids=update_layer_ids,
            )

        # No placeholders here, so a plain string (was a needless f-string).
        msg = "[EPLBManager] rebalance end"
        if enable_timing:
            torch.cuda.synchronize()
            time_end = time.time()
            msg += f" time={time_end - time_start:.3f}s"
        logger.info(msg)

    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
        """Split the routed-expert layer ids into update chunks.

        Sorted so chunks update layers in a deterministic order; when chunking
        is disabled, a single huge chunk covers every layer.
        """
        all_layer_ids = sorted(
            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
        )
        chunk_size = self._rebalance_layers_per_chunk or 1000000
        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
|
92
|
+
|
93
|
+
|
94
|
+
def _chunk_list(items: List, chunk_size):
|
95
|
+
for start_index in range(0, len(items), chunk_size):
|
96
|
+
yield items[start_index : start_index + chunk_size]
|