sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/expert_distribution.py

@@ -18,7 +18,7 @@ from abc import ABC
 from collections import deque
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple, Type
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type
 
 import einops
 import torch
@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC):
     def dump_record(self, output_mode: _OutputMode = "file"):
         self._on_not_implemented()
 
+    @property
+    def recording(self):
+        return False
+
     def _on_not_implemented(self):
         raise Exception(
             "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
             for k in self._accumulator.get_single_pass_gatherer_keys()
         }
 
+        if server_args.enable_expert_distribution_metrics:
+            logger.info(
+                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
+            )
+            self.start_record()
+
     def with_current_layer(self, layer_idx):
         return self._current_layer_idx.with_value(layer_idx)
 
@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         self._reset()
         return output
 
+    @property
+    def recording(self):
+        return self._recording
+
 
 _global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
     _ExpertDistributionRecorderNoop()
@@ -250,15 +264,23 @@ class _SinglePassGatherer(ABC):
             return _DetailSinglePassGatherer(
                 server_args, expert_location_metadata, rank
            )
+
+        if server_args.expert_distribution_recorder_mode == "stat_approx":
+            if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
+                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+            else:
+                raise NotImplementedError
+
         if server_args.enable_deepep_moe:
             if server_args.deepep_mode == "normal":
-                return
+                return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
                 return _DeepepLowLatencySinglePassGatherer(
                     expert_location_metadata, rank
                 )
             else:
                 raise NotImplementedError
+
         return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
 
     def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
@@ -293,7 +315,82 @@ class _SinglePassGatherer(ABC):
         raise NotImplementedError
 
 
-class
+class _DetailSinglePassGatherer(_SinglePassGatherer):
+    # DeepSeek V3 has this value; should generalize later
+    _TOP_K_NUM = 8
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: "ExpertLocationMetadata",
+        rank: int,
+    ):
+        super().__init__(expert_location_metadata, rank)
+        self._metadata: Optional[Dict[str, Any]] = None
+        self._topk_ids_of_layer = torch.zeros(
+            (
+                expert_location_metadata.num_layers,
+                # TODO determine the max number
+                server_args.chunked_prefill_size * 8,
+                self._TOP_K_NUM,
+            ),
+            dtype=torch.int32,
+            device=server_args.device,
+        )
+        self._misc_objects: List[Dict[str, Any]] = []
+        assert (
+            not server_args.enable_two_batch_overlap
+        ), "DetailSinglePassGatherer does not support TBO yet"
+        # TODO assert shared experts fusion is disabled, o/w data is wrong
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        assert self._metadata is None
+        self._metadata = dict(
+            # TODO pr-chain
+            # rids=forward_batch.rids,
+            input_ids=forward_batch.input_ids.cpu().tolist(),
+            positions=forward_batch.positions.cpu().tolist(),
+            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
+            forward_mode=forward_batch.forward_mode.value,
+        )
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
+            topk_ids
+        )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._misc_objects.append(
+            dict(
+                layer_id=layer_idx,
+                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
+                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
+                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
+            )
+        )
+
+    def reset(self):
+        self._topk_ids_of_layer[...] = -1
+        self._misc_objects.clear()
+        self._metadata = None
+
+    def collect(self) -> Dict:
+        num_tokens = len(self._metadata["input_ids"])
+        return dict(
+            **self._metadata,
+            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
+            misc_objects=self._misc_objects,
+        )
+
+
+class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._objects_of_layer = {}
@@ -322,29 +419,63 @@ def _list_sum(a: List, b: List) -> List:
     return [x + y for x, y in zip(a, b, strict=True)]
 
 
-class
-
-
-
-        torch.
-
-
-
-
-
-
-
+class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._enable_global_physical_experts = enable_global_physical_experts
+        self._data = torch.zeros(
+            (
+                self._expert_location_metadata.num_layers,
+                (
+                    self._expert_location_metadata.num_physical_experts
+                    if enable_global_physical_experts
+                    else self._expert_location_metadata.num_local_physical_experts
+                ),
+            ),
+            dtype=torch.int,
+            device="cuda",
+        )
 
-
+    def reset(self):
+        self._data[...] = 0
 
     def collect(self) -> Dict:
-
-
-
+        if self._enable_global_physical_experts:
+            global_physical_count = self._data
+        else:
+            # Can optimize if bottleneck
+            global_physical_count = _convert_local_to_global_physical_count(
+                self._data,
+                rank=self._rank,
+                num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+                num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            )
+
         return dict(global_physical_count=global_physical_count)
 
 
-class
+class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=True)
+
+    # can optimize (e.g. fuse / compile)
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        topk_ids = topk_ids.flatten()
+        mask = topk_ids != -1
+        self._data[layer_idx, :].scatter_add_(
+            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
+        )
+
+
+class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if torch.distributed.get_rank() == 0:
+            logger.info(
+                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
+                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
+            )
+
     def on_deepep_dispatch_normal(
         self,
         layer_idx: int,
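The `scatter_add_` call in `_SelectExpertsSinglePassGatherer.on_select_experts` turns a batch of top-k expert ids into per-expert token counts in a single kernel call. A minimal, self-contained sketch of the same pattern; the sizes and `topk_ids` values below are made up for illustration:

```python
import torch

num_experts = 8  # physical experts in one MoE layer (made-up size)
topk_ids = torch.tensor([[0, 3, 5], [3, 3, -1]])  # -1 marks a padded / invalid slot

counts = torch.zeros(num_experts, dtype=torch.int)

flat = topk_ids.flatten()
mask = flat != -1
# Invalid slots are redirected to index 0, but they add 0 because src is the mask itself.
counts.scatter_add_(dim=0, index=flat.masked_fill(~mask, 0).long(), src=mask.int())

print(counts)  # tensor([1, 0, 0, 3, 0, 1, 0, 0], dtype=torch.int32)
```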
@@ -369,17 +500,9 @@ class _DeepepNormalSinglePassGatherer(_LayerBasedSinglePassGatherer):
         return dict(global_physical_count=global_physical_count)
 
 
-class _DeepepLowLatencySinglePassGatherer(
+class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._data = torch.zeros(
-            (
-                self._expert_location_metadata.num_layers,
-                self._expert_location_metadata.num_local_physical_experts,
-            ),
-            dtype=torch.int,
-            device="cuda",
-        )
+        super().__init__(*args, **kwargs, enable_global_physical_experts=False)
 
     def on_deepep_dispatch_low_latency(
         self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
@@ -387,19 +510,6 @@ class _DeepepLowLatencySinglePassGatherer(_SinglePassGatherer):
         # Most naive implementation, can optimize later
         self._data[layer_idx, :] += local_physical_count_of_layer
 
-    def reset(self):
-        self._data[...] = 0
-
-    def collect(self) -> Dict:
-        # Can optimize if bottleneck
-        global_physical_count = _convert_local_to_global_physical_count(
-            self._data,
-            rank=self._rank,
-            num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
-            num_physical_experts=self._expert_location_metadata.num_physical_experts,
-        )
-        return dict(global_physical_count=global_physical_count)
-
 
 def _convert_local_to_global_physical_count(
     local_physical_count: torch.Tensor,
@@ -438,9 +548,9 @@ class _Accumulator(ABC):
     def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
         return {
             "stat": _StatAccumulator,
-
-
-
+            "stat_approx": _StatAccumulator,
+            "per_pass": _DetailAccumulator,
+            "per_token": _DetailAccumulator,
         }[server_args.expert_distribution_recorder_mode]
 
     def __init__(
@@ -547,6 +657,63 @@ class _DequeCollection:
         return {d.maxlen: sum(d) / len(d) for d in self._dequeues}
 
 
+class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._records = []
+
+    def get_single_pass_gatherer_keys(self):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
+        return super().get_single_pass_gatherer_keys()
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
+        return super().get_single_pass_gatherer_key(debug_name)
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+
+        def _process_object(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.cpu().clone()
+            return obj
+
+        single_pass_data_processed = {
+            k: _process_object(v) for k, v in single_pass_data.items()
+        }
+
+        self._records.append(
+            dict(
+                forward_pass_id=forward_pass_id,
+                rank=self._rank,
+                gatherer_key=gatherer_key,
+                **single_pass_data_processed,
+            )
+        )
+
+    def reset(self):
+        super().reset()
+        self._records.clear()
+
+    def dump(self, output_mode: _OutputMode):
+        assert output_mode == "file"
+        output = dict(
+            records=self._records,
+            # NOTE: This may change during recording, so here we say it is the "last" one
+            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
+        )
+
+
 class _StatAccumulator(_UtilizationRateAccumulatorMixin):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
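`_DetailAccumulator.append` copies every tensor in a single-pass record to CPU before buffering it, so the per-pass history does not hold on to GPU memory. A minimal sketch of that pattern; the record contents here are invented, and the note about `torch.load` assumes that `_dump_to_file` writes the `.pt` file with `torch.save`, which this diff does not show:

```python
import torch

def _process_object(obj):
    # Detach tensors to CPU so buffered per-pass records do not pin GPU memory.
    return obj.cpu().clone() if isinstance(obj, torch.Tensor) else obj

records = []
single_pass_data = {"global_physical_count": torch.arange(4)}  # made-up payload
records.append(
    dict(
        forward_pass_id=1,
        rank=0,
        gatherer_key="primary",
        **{k: _process_object(v) for k, v in single_pass_data.items()},
    )
)

# Assuming _dump_to_file wraps torch.save, a dump could later be inspected with:
#   data = torch.load("expert_distribution_recorder_<timestamp>_<rank>.pt")
#   data["records"][0]["global_physical_count"]
print(records[0]["forward_pass_id"], records[0]["global_physical_count"])
```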
@@ -560,6 +727,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
             dtype=torch.int32,
             device=self._server_args.device,
         )
+        self._first_dump = True
 
     def append(
         self,
@@ -584,9 +752,15 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
             num_logical_experts=self._expert_location_metadata.num_logical_experts,
             physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
         )
+
+        if self._first_dump:
+            self._first_dump = False
+            torch.cuda.empty_cache()
+
         torch.distributed.all_reduce(
             logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
         )
+
         output = dict(
             rank=self._rank,
             logical_count=logical_count_of_buffered_step,
sglang/srt/managers/expert_location.py

@@ -13,6 +13,7 @@
 # ==============================================================================
 import json
 import logging
+import random
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
@@ -22,7 +23,7 @@ import torch.distributed
 import torch.nn.functional as F
 
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.managers import
+from sglang.srt.managers import eplb_algorithms
 from sglang.srt.model_loader import get_model_architecture
 from sglang.srt.server_args import ServerArgs
 
@@ -32,9 +33,11 @@ logger = logging.getLogger(__name__)
 @dataclass
 class ExpertLocationMetadata:
     physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    physical_to_logical_map_cpu: torch.Tensor
     logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
     logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
-
+    # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
 
     # -------------------------------- properties ------------------------------------
 
@@ -69,11 +72,8 @@ class ExpertLocationMetadata:
         num_layers_2, num_logical_experts_1 = (
             self.logical_to_all_physical_map_num_valid.shape
         )
-
-
-        )
-        assert num_layers_0 == num_layers_1 == num_layers_2 == num_layers_3
-        assert num_logical_experts_0 == num_logical_experts_1 == num_logical_experts_2
+        assert num_layers_0 == num_layers_1 == num_layers_2
+        assert num_logical_experts_0 == num_logical_experts_1
         assert num_physical_experts_0 == num_physical_experts_1
 
         # -------------------------------- construction ------------------------------------
@@ -116,6 +116,7 @@ class ExpertLocationMetadata:
         )
 
         return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
             ep_size=common["ep_size"],
             physical_to_logical_map=physical_to_logical_map,
             logical_to_all_physical_map=logical_to_all_physical_map,
@@ -134,26 +135,31 @@ class ExpertLocationMetadata:
         common = ExpertLocationMetadata._init_common(server_args, model_config)
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
-
-
-        if phase == "null":
-            phase = "decode"
+        num_groups = model_config_for_expert_location.num_groups
+        num_nodes = server_args.nnodes
 
         physical_to_logical_map, logical_to_all_physical_map, expert_count = (
-
+            eplb_algorithms.rebalance_experts(
                 tokens_per_expert=logical_count,
                 num_physical_experts=num_physical_experts,
                 num_local_physical_experts=num_physical_experts // common["ep_size"],
-                num_groups=
-                num_nodes=
-
+                num_groups=num_groups,
+                num_nodes=num_nodes,
+                algorithm=eplb_algorithms.compute_algorithm(
+                    raw_algorithm=server_args.eplb_algorithm,
+                    num_groups=num_groups,
+                    num_nodes=num_nodes,
+                ),
            )
        )
 
        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
            ep_size=common["ep_size"],
-            physical_to_logical_map=physical_to_logical_map,
-            logical_to_all_physical_map=logical_to_all_physical_map
+            physical_to_logical_map=physical_to_logical_map.to(server_args.device),
+            logical_to_all_physical_map=logical_to_all_physical_map.to(
+                server_args.device
+            ),
        )
 
     @staticmethod
@@ -179,6 +185,7 @@ class ExpertLocationMetadata:
 
     @staticmethod
     def _init_raw(
+        server_args: ServerArgs,
         ep_size: int,
         physical_to_logical_map: torch.Tensor,
         logical_to_all_physical_map: torch.Tensor,
@@ -197,14 +204,19 @@ class ExpertLocationMetadata:
 
         return ExpertLocationMetadata(
             physical_to_logical_map=physical_to_logical_map,
+            physical_to_logical_map_cpu=physical_to_logical_map.cpu(),
             logical_to_all_physical_map=logical_to_all_physical_map_padded,
             logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
-            logical_to_rank_dispatch_physical_map=
-
-
-
-
-
+            logical_to_rank_dispatch_physical_map=(
+                compute_logical_to_rank_dispatch_physical_map(
+                    logical_to_all_physical_map=logical_to_all_physical_map,
+                    num_gpus=ep_size,
+                    num_physical_experts=num_physical_experts,
+                    # TODO improve when we have real EP rank
+                    ep_rank=torch.distributed.get_rank() % ep_size,
+                )
+                if server_args.ep_dispatch_algorithm == "static"
+                else None
             ),
         )
 
@@ -213,6 +225,7 @@ class ExpertLocationMetadata:
     def update(
         self,
         other: "ExpertLocationMetadata",
+        update_layer_ids: List[int],
     ):
         for field in [
             "ep_size",
@@ -221,12 +234,21 @@ class ExpertLocationMetadata:
 
         for field in [
             "physical_to_logical_map",
+            "physical_to_logical_map_cpu",
             "logical_to_all_physical_map",
             "logical_to_all_physical_map_num_valid",
             "logical_to_rank_dispatch_physical_map",
         ]:
-
-
+            other_field = getattr(other, field)
+            self_field = getattr(self, field)
+            assert (other_field is not None) == (self_field is not None)
+            if self_field is not None:
+                mask_update = torch.tensor(
+                    [i in update_layer_ids for i in range(self.num_layers)]
+                )
+                mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
+                mask_update = mask_update.to(self_field.device, non_blocking=True)
+                self_field[...] = torch.where(mask_update, other_field, self_field)
 
     # -------------------------------- usage ------------------------------------
 
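`ExpertLocationMetadata.update` only overwrites the layers listed in `update_layer_ids` by broadcasting a per-layer boolean mask through `torch.where`. A small self-contained sketch of the same masking idea, with made-up shapes:

```python
import torch

num_layers, num_experts = 4, 6  # made-up sizes
self_field = torch.zeros(num_layers, num_experts, dtype=torch.int64)
other_field = torch.ones(num_layers, num_experts, dtype=torch.int64)
update_layer_ids = [1, 3]  # only these layers receive the new mapping

mask_update = torch.tensor([i in update_layer_ids for i in range(num_layers)])
# Reshape to (num_layers, 1) so the boolean mask broadcasts over the trailing dims.
mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))

self_field[...] = torch.where(mask_update, other_field, self_field)
print(self_field[:, 0])  # tensor([0, 1, 0, 1])
```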
@@ -292,49 +314,82 @@ def _pad_nested_array(arr, pad_value):
     return padded
 
 
-# TODO
+# TODO optimize performance (rewrite and/or run in separate process with overlap)
 def compute_logical_to_rank_dispatch_physical_map(
     logical_to_all_physical_map: torch.Tensor,
-    logical_to_all_physical_map_num_valid: torch.Tensor,
     num_gpus: int,
     num_physical_experts: int,
     ep_rank: int,
-
+    seed: int = 42,
 ):
-
+    r = random.Random(seed)
 
     num_local_physical_experts = num_physical_experts // num_gpus
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    dtype = logical_to_all_physical_map.dtype
 
-
-
-
-
-    chosen_index = (
-        torch.randint(
-            0, 65536, output_shape, dtype=torch.int32, device=device, generator=g
-        )
-        % logical_to_all_physical_map_num_valid
+    logical_to_rank_dispatch_physical_map = torch.full(
+        size=(num_gpus, num_layers, num_logical_experts),
+        fill_value=-1,
+        dtype=dtype,
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    for layer_id in range(num_layers):
+        for logical_expert_id in range(num_logical_experts):
+            candidate_physical_expert_ids = _logical_to_all_physical_raw(
+                logical_to_all_physical_map, layer_id, logical_expert_id
+            )
+            output_partial = logical_to_rank_dispatch_physical_map[
+                :, layer_id, logical_expert_id
+            ]
+
+            for gpu_id in range(num_gpus):
+                same_gpu_physical_expert_ids = [
+                    physical_expert_id
+                    for physical_expert_id in candidate_physical_expert_ids
+                    if _compute_gpu_id_of_physical_expert(
+                        physical_expert_id, num_local_physical_experts
+                    )
+                    == gpu_id
+                ]
+                if len(same_gpu_physical_expert_ids) > 0:
+                    output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
+
+            num_remain = torch.sum(output_partial == -1).item()
+            output_partial[output_partial == -1] = torch.tensor(
+                _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
+                dtype=dtype,
+            )
 
     assert torch.all(logical_to_rank_dispatch_physical_map != -1)
-
+
+    device = logical_to_all_physical_map.device
+    return logical_to_rank_dispatch_physical_map[ep_rank, :, :].to(device)
+
+
+def _logical_to_all_physical_raw(
+    logical_to_all_physical_map, layer_id: int, logical_expert_id: int
+) -> List[int]:
+    return [
+        physical_expert_id
+        for physical_expert_id in logical_to_all_physical_map[
+            layer_id, logical_expert_id
+        ].tolist()
+        if physical_expert_id != -1
+    ]
+
+
+def _compute_gpu_id_of_physical_expert(
+    physical_expert_id: int, num_local_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_physical_experts
+
+
+def _fair_choices(arr: List, k: int, r: random.Random) -> List:
+    quotient, remainder = divmod(k, len(arr))
+    ans = arr * quotient + r.sample(arr, k=remainder)
+    r.shuffle(ans)
+    return ans
 
 
 @dataclass
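`_fair_choices` fills the dispatch slots that remain after same-GPU candidates are assigned: it spreads `k` picks as evenly as possible over the candidate physical experts, so replica usage stays balanced. A usage sketch of the helper as added in this hunk; the candidate ids below are arbitrary:

```python
import random
from typing import List

def _fair_choices(arr: List, k: int, r: random.Random) -> List:
    # Each candidate appears floor(k / len(arr)) times; the remainder is sampled
    # without replacement, then the whole assignment is shuffled.
    quotient, remainder = divmod(k, len(arr))
    ans = arr * quotient + r.sample(arr, k=remainder)
    r.shuffle(ans)
    return ans

r = random.Random(42)
# Five GPUs still need a replica of a logical expert that has two physical copies.
print(_fair_choices([10, 17], k=5, r=r))  # e.g. [10, 17, 10, 17, 10]; counts differ by at most 1
```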
@@ -363,7 +418,6 @@ def compute_initial_expert_location_metadata(
 ) -> ExpertLocationMetadata:
     data = server_args.init_expert_location
     if data == "trivial":
-        logger.info("init_expert_location from trivial")
         return ExpertLocationMetadata.init_trivial(server_args, model_config)
 
     # TODO unify with the utils function