sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py CHANGED
@@ -5,7 +5,7 @@ import torch
  import torch.distributed as dist
  from torch import nn

- from sglang.srt.distributed import get_tensor_model_parallel_group
+ from sglang.srt.distributed import get_tp_group
  from sglang.srt.layers.dp_attention import get_attention_tp_group
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
  def __init__(self):
  super().__init__()
  self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
- self.tp_sync_group = get_tensor_model_parallel_group().device_group
+ self.tp_sync_group = get_tp_group().device_group

  if global_server_args_dict["enable_dp_attention"]:
  self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):

  # Apply the custom logit processors if registered in the sampling info.
  if sampling_info.has_custom_logit_processor:
- self._apply_custom_logit_processor(logits, sampling_info)
+ apply_custom_logit_processor(logits, sampling_info)

  if self.use_nan_detection and torch.any(torch.isnan(logits)):
  logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
  probs = logits
  del logits

- if global_server_args_dict["sampling_backend"] == "flashinfer":
- if return_logprob:
- # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
- # https://github.com/flashinfer-ai/flashinfer/issues/708
- # so we use the torch implementation.
-
- # clamp to avoid -inf
- logprobs = torch.log(
- top_p_normalize_probs_torch(probs, sampling_info.top_ps)
- ).clamp(min=torch.finfo(probs.dtype).min)
-
- max_top_k_round, batch_size = 32, probs.shape[0]
- if sampling_info.need_min_p_sampling:
- probs = top_k_renorm_prob(probs, sampling_info.top_ks)
- probs = top_p_renorm_prob(probs, sampling_info.top_ps)
- batch_next_token_ids = min_p_sampling_from_probs(
- probs, sampling_info.min_ps
- )
- else:
- # Check Nan will throw exception, only check when crash_on_warnings is True
- check_nan = self.use_nan_detection and crash_on_warnings()
- batch_next_token_ids = top_k_top_p_sampling_from_probs(
+ if True: # Keep this redundant check to simplify some internal code sync
+ if global_server_args_dict["sampling_backend"] == "flashinfer":
+ if sampling_info.need_min_p_sampling:
+ probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+ probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+ batch_next_token_ids = min_p_sampling_from_probs(
+ probs, sampling_info.min_ps
+ )
+ else:
+ batch_next_token_ids = top_k_top_p_sampling_from_probs(
+ probs,
+ sampling_info.top_ks,
+ sampling_info.top_ps,
+ filter_apply_order="joint",
+ check_nan=self.use_nan_detection,
+ )
+ elif global_server_args_dict["sampling_backend"] == "pytorch":
+ # A slower fallback implementation with torch native operations.
+ batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
  probs,
  sampling_info.top_ks,
  sampling_info.top_ps,
- filter_apply_order="joint",
- check_nan=check_nan,
+ sampling_info.min_ps,
+ sampling_info.need_min_p_sampling,
+ )
+ else:
+ raise ValueError(
+ f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
  )

- elif global_server_args_dict["sampling_backend"] == "pytorch":
- # A slower fallback implementation with torch native operations.
- batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
- probs,
- sampling_info.top_ks,
- sampling_info.top_ps,
- sampling_info.min_ps,
- sampling_info.need_min_p_sampling,
- )
-
- if return_logprob:
- # clamp to avoid -inf
- logprobs = torch.log(
- top_p_normalize_probs_torch(probs, sampling_info.top_ps)
- ).clamp(min=torch.finfo(probs.dtype).min)
- else:
- raise ValueError(
- f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
- )
+ if return_logprob:
+ # clamp to avoid -inf
+ logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)

  # Attach logprobs to logits_output (in-place modification)
  if return_logprob:
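
Note on the logprob change above: log-probabilities are now computed directly as torch.log over the already-filtered probabilities and clamped so that zeroed-out entries do not produce -inf. A tiny standalone illustration of that clamp (the values here are made up):

    import torch

    probs = torch.tensor([0.0, 0.25, 0.75])
    # log(0) would be -inf; clamping pins it to the most negative finite
    # float32 value (about -3.4e38) so downstream logprob math stays finite.
    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)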
@@ -165,39 +150,6 @@ class Sampler(nn.Module):

  return batch_next_token_ids

- def _apply_custom_logit_processor(
- self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
- ):
- """Apply custom logit processors to the logits.
- This function will modify the logits in-place."""
-
- assert logits.shape[0] == len(sampling_batch_info), (
- f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
- f"sampling_batch_info ({len(sampling_batch_info)})"
- )
-
- for _, (
- processor,
- batch_mask,
- ) in sampling_batch_info.custom_logit_processor.items():
- # Get the batch indices that need to be processed
- batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
- assert batch_mask.shape[0] == len(sampling_batch_info), (
- f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
- f"sampling_batch_info ({len(sampling_batch_info)})"
- )
-
- # Apply the processor to the logits
- logits[batch_mask] = processor(
- logits[batch_mask],
- [sampling_batch_info.custom_params[i] for i in batch_indices],
- )
-
- logger.debug(
- f"Custom logit processor {processor.__class__.__name__} is applied."
- )
-

  def top_k_top_p_min_p_sampling_from_probs_torch(
  probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
  return batch_next_token_ids


+ def sampling_from_probs_torch(probs: torch.Tensor):
+ """A sampling implementation with native pytorch operations, without
+ top-k, top-p, or min-p filtering."""
+ sampled_index = torch.multinomial(probs, num_samples=1)
+ batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+ return batch_next_token_ids
+
+
  def top_p_normalize_probs_torch(
  probs: torch.Tensor,
  top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
  output_token_ids_logprobs_idx.append([])

  return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+ def apply_custom_logit_processor(
+ logits: torch.Tensor,
+ sampling_batch_info: SamplingBatchInfo,
+ num_tokens_in_batch: int = 1,
+ ):
+ """Apply custom logit processors to the logits.
+ This function will modify the logits in-place.
+ num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+ tokens. By default, we assume each batch contains only 1 token.
+ """
+
+ assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+ f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+ f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+ f"({num_tokens_in_batch})"
+ )
+
+ for _, (
+ processor,
+ batch_mask,
+ ) in sampling_batch_info.custom_logit_processor.items():
+ # Get the batch indices that need to be processed
+ batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+ assert batch_mask.shape[0] == len(sampling_batch_info), (
+ f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+ f"sampling_batch_info ({len(sampling_batch_info)})"
+ )
+ batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+ # Apply the processor to the logits
+ logits[batch_mask] = processor(
+ logits[batch_mask],
+ [sampling_batch_info.custom_params[i] for i in batch_indices],
+ )
+
+ logger.debug(
+ f"Custom logit processor {processor.__class__.__name__} is applied."
+ )
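
For readers tracking this refactor: _apply_custom_logit_processor moves out of Sampler into the module-level apply_custom_logit_processor above, and gains num_tokens_in_batch so speculative decoding (multiple draft tokens per request) can reuse it. Each registered processor is a callable that receives the masked logits rows plus one custom_params entry per selected request and returns a tensor of the same shape. A minimal hypothetical processor sketched against that contract (ban_token_processor and the "banned_token_id" key are illustrative, not part of SGLang):

    import torch

    def ban_token_processor(logits: torch.Tensor, custom_params: list) -> torch.Tensor:
        # With spec decoding there are num_tokens_in_batch contiguous rows per
        # request, so apply each request's params to its block of rows.
        rows_per_req = logits.shape[0] // max(len(custom_params), 1)
        for i, params in enumerate(custom_params):
            banned = params.get("banned_token_id")
            if banned is not None:
                logits[i * rows_per_req : (i + 1) * rows_per_req, banned] = float("-inf")
        return logits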
sglang/srt/layers/utils.py CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
  """
  input = args[0] if args else next(iter(kwargs.values()))
  return (input,) if self.return_tuple else input
+
+
+ def is_sm100_supported(device=None) -> bool:
+ return (torch.cuda.get_device_capability(device)[0] == 10) and (
+ torch.version.cuda >= "12.8"
+ )
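
The new is_sm100_supported helper gates code paths on a Blackwell-class GPU (compute capability 10.x) plus CUDA 12.8 or newer. A hedged usage sketch, assuming the helper lives in sglang.srt.layers.utils as the hunk above suggests (the backend names are invented for illustration):

    import torch
    from sglang.srt.layers.utils import is_sm100_supported

    def pick_gemm_backend() -> str:
        # Prefer an SM100-only kernel when supported, otherwise fall back.
        if torch.cuda.is_available() and is_sm100_supported():
            return "sm100_kernel"
        return "generic_fallback"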
sglang/srt/lora/layers.py CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  self.A_buffer_gate_up = A_buffer
  if self.lora_backend.fuse_stacked_lora_b:
  # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
- if not hasattr(self, "B_buffer_gate_up") or self.B_buffer_gate_up is None:
+ if getattr(self, "B_buffer_gate_up", None) is None:
  self.B_buffer_gate_up = torch.empty(
  (
  B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]

  # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
- if not hasattr(self, "B_buffer_qkv") or self.B_buffer_qkv is None:
+ if getattr(self, "B_buffer_qkv", None) is None:
  self.B_buffer_qkv = torch.empty(
  (
  B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  )

  # Offsets of q/k/v in output dimension
- if not hasattr(self, "output_offset") or self.output_offset is None:
- self.output_offset = torch.empty(
- 4, dtype=torch.int32, device=B_buffer_q.device
+ if getattr(self, "output_offset", None) is None:
+ self.output_offset = torch.tensor(
+ [
+ 0,
+ output_dim_q,
+ output_dim_q + output_dim_kv,
+ output_dim_q + 2 * output_dim_kv,
+ ],
+ dtype=torch.int32,
+ device=B_buffer_q.device,
  )
- self.output_offset[:4] = torch.tensor(
- [
- 0,
- output_dim_q,
- output_dim_q + output_dim_kv,
- output_dim_q + 2 * output_dim_kv,
- ],
- dtype=torch.int32,
- device=B_buffer_q.device,
- )
  # For computing number of launched blocks
  self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
  else:
sglang/srt/lora/lora.py CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
  for i in range(self.base_hf_config.num_hidden_layers):
  layer = self.layers[i]
  weight_names = [name for name, _ in layer.weights.items()]
- self.stack_qkv_proj(weight_names, layer.weights)
- self.stack_gate_up_proj(weight_names, layer.weights)
-
- def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+ self.normalize_qkv_proj(weight_names, layer.weights)
+ self.normalize_gate_up_proj(weight_names, layer.weights)

+ def normalize_qkv_proj(
+ self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+ ):
  # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
  target_module = set()
  for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
  target_module.add("q_proj")
  if "v_proj" in weight_name:
  target_module.add("v_proj")
+ if "qkv_proj" in weight_name:
+ target_module.add("qkv_proj")
  if len(target_module) == 0:
  return

@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
  if "k_proj" in target_module:
  weights.pop(k_name)
  weights.pop(v_name)
+ elif "qkv_proj" in weight_name:
+ # If qkv_proj is already stacked, we normalize it following the SGL convention.
+ qkv_name = weight_name
+ q_name = weight_name.replace("qkv_proj", "q_proj")
+ k_name = weight_name.replace("qkv_proj", "k_proj")
+ v_name = weight_name.replace("qkv_proj", "v_proj")
+ kv_name = weight_name.replace("qkv_proj", "kv_proj")
+ if "lora_A" in weight_name:
+ weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+ else:
+ head_size = (
+ self.base_hf_config.hidden_size
+ // self.base_hf_config.num_attention_heads
+ )
+ weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+ weights[qkv_name],
+ [
+ head_size * self.base_hf_config.num_attention_heads,
+ head_size * self.base_hf_config.num_key_value_heads,
+ head_size * self.base_hf_config.num_key_value_heads,
+ ],
+ dim=0,
+ )
+ weights[kv_name] = torch.stack(
+ [k_proj_weight, v_proj_weight],
+ dim=0,
+ )

- def stack_gate_up_proj(
+ def normalize_gate_up_proj(
  self, weight_names: List[str], weights: Dict[str, torch.Tensor]
  ):
  for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
  weights.pop(weight_name)
  if up_name in weights:
  weights.pop(up_name)
+ elif "gate_up_proj" in weight_name:
+ # If gate_up_proj is already stacked, we normalize it following the SGL convention
+ gate_up_name = weight_name
+ if "lora_A" in weight_name:
+ weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+ else:
+ output_dim = weights[gate_up_name].shape[0] // 2
+ weights[gate_up_name] = torch.stack(
+ [
+ weights[gate_up_name][:output_dim, :],
+ weights[gate_up_name][output_dim:, :],
+ ],
+ dim=0,
+ )
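
To make the qkv_proj normalization above concrete, here is a worked split with illustrative config values (hidden_size, head counts, and rank are hypothetical, not taken from the diff): with hidden_size=4096, 32 attention heads, and 8 KV heads, head_size is 128, so a stacked lora_B weight splits into 4096 q rows and 1024 rows each for k and v, and k/v are then stacked into the SGL "kv_proj" layout.

    import torch

    hidden_size, num_attention_heads, num_key_value_heads, rank = 4096, 32, 8, 16
    head_size = hidden_size // num_attention_heads  # 128

    # Stacked lora_B for qkv_proj: q, k, v rows concatenated along dim 0.
    qkv_lora_b = torch.randn(head_size * (num_attention_heads + 2 * num_key_value_heads), rank)

    q_w, k_w, v_w = torch.split(
        qkv_lora_b,
        [
            head_size * num_attention_heads,   # 4096 rows for q_proj
            head_size * num_key_value_heads,   # 1024 rows for k_proj
            head_size * num_key_value_heads,   # 1024 rows for v_proj
        ],
        dim=0,
    )
    kv_w = torch.stack([k_w, v_w], dim=0)  # (2, 1024, rank), the stacked kv_proj layout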
sglang/srt/lora/lora_manager.py CHANGED
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
  LoRAType,
  get_customized_names_from_hf_names,
  get_layer_id,
- get_stacked_name,
+ get_normalized_lora_weight_names,
  get_weight_name,
  )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -101,10 +101,13 @@ class LoRAManager:
  self.hf_target_names.update(self.configs[name].target_modules)

  # Target lora weight names for lora_a and lora_b modules respectively.
- # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
- self.lora_weight_names: Set[Tuple[str]] = set(
- [get_stacked_name(module) for module in self.hf_target_names]
- )
+ weights_A: List[str] = []
+ weights_B: List[str] = []
+ for module in self.hf_target_names:
+ lora_A, lora_B = get_normalized_lora_weight_names(module)
+ weights_A += lora_A
+ weights_B += lora_B
+ self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)

  # load all weights to cpu
  self.loras: Dict[str, LoRAAdapter] = {}
@@ -263,7 +266,18 @@ class LoRAManager:
  self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
  i: [] for i in range(self.base_hf_config.num_hidden_layers)
  }
+
  for module_name, module in self.base_model.named_modules():
+ # TODO (lifuhuang): in the future, we should consider generalizing the
+ # should_apply_lora function to support mapping by full module name instead
+ # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+ # attention stacks (e.g., multimodal models).
+ # See: https://github.com/sgl-project/sglang/issues/6608
+ if getattr(
+ self.base_model, "should_apply_lora", None
+ ) and not self.base_model.should_apply_lora(module_name):
+ continue
+
  # The module should be converted if it is included in target_names
  if module_name.split(".")[-1] in customized_target_names:
  layer_id = get_layer_id(module_name)
sglang/srt/lora/mem_pool.py CHANGED
@@ -91,18 +91,16 @@ class LoRAMemoryPool:

  def init_buffers(
  self,
- lora_weight_names: Set[Tuple[str]],
+ lora_weight_names: Tuple[Set[str]],
  base_model: torch.nn.Module,
  ):

  # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
  # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
- self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
+ self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
  device = next(base_model.parameters()).device
- lora_module_A_names = set([name[0] for name in lora_weight_names])
- lora_module_B_names = set([name[1] for name in lora_weight_names])
  # Init A tensor, column_major=False
- for module_A in lora_module_A_names:
+ for module_A in lora_weight_names[0]:
  lora_A_shape = self.get_lora_A_shape(module_A, base_model)
  self.A_buffer[module_A] = [
  torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
  dtype=self.dtype,
  device=device,
  )
- for i in range(self.num_layer)
+ for _ in range(self.num_layer)
  ]
  # Init B tensor, column_major=True
- for module_B in lora_module_B_names:
+ for module_B in lora_weight_names[1]:
  lora_B_shape = self.get_lora_B_shape(module_B, base_model)
  self.B_buffer[module_B] = [
  torch.empty(
@@ -159,6 +157,10 @@ class LoRAMemoryPool:
     def load_lora_weight_to_buffer(
         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
     ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
 
         if uid is None:
             for i in range(self.num_layer):
@@ -210,21 +212,27 @@ class LoRAMemoryPool:
 
         for name, weights in temp_A_buffer.items():
             c = get_stacked_multiply(name)
-            self.A_buffer[name][layer_id][buffer_id][: lora_rank * c, :].copy_(
-                weights
-            )
+            buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                : lora_rank * c, :
+            ]
+            check_lora_weight_shape(buffer_view, weights)
+            buffer_view.copy_(weights)
 
         for name, weights in temp_B_buffer.items():
             c = get_stacked_multiply(name)
             if c > 1:
                 for stacked_id in range(c):
-                    self.B_buffer[name][layer_id][stacked_id][buffer_id][
-                        :, :lora_rank
-                    ].copy_(weights[stacked_id])
+                    buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                        buffer_id
+                    ][:, :lora_rank]
+                    check_lora_weight_shape(buffer_view, weights[stacked_id])
+                    buffer_view.copy_(weights[stacked_id])
             else:
-                self.B_buffer[name][layer_id][0][buffer_id][:, :lora_rank].copy_(
-                    weights
-                )
+                buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                    :, :lora_rank
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
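Note: the refactor names the sliced pool region (buffer_view) so its shape can be validated before the in-place copy. A self-contained sketch of that pattern with dummy shapes (values are illustrative, not from sglang):

import torch

def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
    assert (
        buffer_view.shape == weight.shape
    ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."

max_rank, input_dim, lora_rank = 32, 64, 16
A_buffer = torch.empty(max_rank, input_dim)   # pre-allocated pool slot
weights = torch.randn(lora_rank, input_dim)   # adapter weights for one layer

buffer_view = A_buffer[:lora_rank, :]         # slice the region this adapter uses
check_lora_weight_shape(buffer_view, weights) # fail fast on rank/dim mismatches
buffer_view.copy_(weights)                    # in-place copy into the pool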
sglang/srt/lora/utils.py CHANGED
@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -106,18 +106,22 @@ def get_hidden_dim(
     raise NotImplementedError()
 
 
-def get_stacked_name(name: str) -> Tuple[str]:
+def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to (stacked name for Lora A, stacked name for Lora B)
+    Mapping a target module name to names of the normalized LoRA weights.
+    Returned tuple contains (names for LoRA A, names for LoRA B).
     """
     params_mapping = {
-        "q_proj": ("qkv_proj", "q_proj"),
-        "k_proj": ("qkv_proj", "kv_proj"),
-        "v_proj": ("qkv_proj", "kv_proj"),
-        "gate_proj": ("gate_up_proj", "gate_up_proj"),
-        "up_proj": ("gate_up_proj", "gate_up_proj"),
+        "q_proj": (["qkv_proj"], ["q_proj"]),
+        "k_proj": (["qkv_proj"], ["kv_proj"]),
+        "v_proj": (["qkv_proj"], ["kv_proj"]),
+        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
+        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
     }
-    return params_mapping.get(name, (name, name))
+    stacked = params_mapping.get(name, ([name], [name]))
+    return stacked
 
 
 def get_stacked_multiply(module_name: str) -> int:
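Note: with sglang 0.4.7 installed, the new helper can be exercised directly; the expected results below follow from the mapping shown in this hunk:

from sglang.srt.lora.utils import get_normalized_lora_weight_names

assert get_normalized_lora_weight_names("q_proj") == (["qkv_proj"], ["q_proj"])
assert get_normalized_lora_weight_names("qkv_proj") == (["qkv_proj"], ["q_proj", "kv_proj"])
# Unmapped modules fall through unchanged.
assert get_normalized_lora_weight_names("down_proj") == (["down_proj"], ["down_proj"])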
@@ -133,7 +137,7 @@ def get_stacked_multiply(module_name: str) -> int:
 
 
 def get_weight_name(
-    target_name: str, lora_weight_names: Set[Tuple[str]], lora_type: LoRAType
+    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
 ) -> Optional[str]:
     """
     target_name is name of a given module,
@@ -142,9 +146,9 @@
     Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for weight_name_pair in lora_weight_names:
-        if weight_name_pair[idx] in target_name:
-            return weight_name_pair[idx]
+    for weight_name in lora_weight_names[idx]:
+        if weight_name in target_name:
+            return weight_name
     raise ValueError(
         f"Cannot find weight name for {target_name} in {lora_weight_names}"
     )
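Note: lookups now index the (A-names, B-names) tuple by LoRA type before substring-matching the module name. A usage sketch against sglang 0.4.7 (the module path string and name sets are illustrative):

from sglang.srt.lora.utils import LoRAType, get_weight_name

# Illustrative (A-names, B-names) tuple, e.g. as built by LoRAManager.
lora_weight_names = ({"qkv_proj", "o_proj"}, {"q_proj", "kv_proj", "o_proj"})
target = "model.layers.0.self_attn.qkv_proj"

print(get_weight_name(target, lora_weight_names, LoRAType.LORA_A))  # qkv_proj
print(get_weight_name(target, lora_weight_names, LoRAType.LORA_B))  # kv_proj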
@@ -248,12 +248,20 @@ class DataParallelController:
 
     def round_robin_scheduler(self, req: Req):
         if self.server_args.disaggregation_mode == "null":
-            self.workers[self.round_robin_counter].send_pyobj(req)
-            self.round_robin_counter = (self.round_robin_counter + 1) % len(
-                self.workers
-            )
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
         else:
-            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
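Note: in both branches an explicit req.data_parallel_rank now takes precedence; otherwise the old behavior applies (round-robin when not disaggregated, bootstrap-room hashing when disaggregated). A standalone sketch of just the selection rule (FakeReq, the worker count, and the flag are stand-ins, not sglang types):

from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass
class FakeReq:
    data_parallel_rank: Optional[int] = None
    bootstrap_room: int = 0

def pick_worker(
    req: FakeReq, num_workers: int, rr_counter: int, disaggregated: bool
) -> Tuple[int, int]:
    """Return (chosen worker index, updated round-robin counter)."""
    if req.data_parallel_rank is not None:
        return req.data_parallel_rank, rr_counter  # explicit DP rank wins
    if not disaggregated:
        return rr_counter, (rr_counter + 1) % num_workers  # round-robin
    return req.bootstrap_room % num_workers, rr_counter  # hash by bootstrap room

print(pick_worker(FakeReq(data_parallel_rank=2), 4, 0, False))  # (2, 0)
print(pick_worker(FakeReq(), 4, 0, False))                      # (0, 1)
print(pick_worker(FakeReq(bootstrap_room=7), 4, 0, True))       # (3, 0)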
@@ -0,0 +1,63 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
+
+
+class EplbAlgorithm(Enum):
+    deepseek = auto()
+    deepseek_hierarchical = auto()
+    deepseek_vec = auto()
+    deepseek_vec_hierarchical = auto()
+    # TODO may have more algorithm later
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    algorithm: EplbAlgorithm,
+):
+    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+        return deepseek.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+        )
+
+    if algorithm in [
+        EplbAlgorithm.deepseek_vec,
+        EplbAlgorithm.deepseek_vec_hierarchical,
+    ]:
+        return deepseek_vec.rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+        )
+
+
+    raise NotImplementedError
+
+
+def compute_algorithm(
+    raw_algorithm: str,
+    num_groups: Optional[int],
+    num_nodes: int,
+) -> EplbAlgorithm:
+    if raw_algorithm != "auto":
+        return EplbAlgorithm[raw_algorithm]
+    # TODO test on real scenarios and know which ones perform better
+    if (num_groups is not None) and (num_groups % num_nodes == 0):
+        return EplbAlgorithm.deepseek_hierarchical
+    else:
+        return EplbAlgorithm.deepseek
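Note: assuming this new file is the eplb_algorithms package __init__ (its own imports suggest the sglang.srt.managers.eplb_algorithms path), the "auto" selection can be exercised directly; the expected values follow from compute_algorithm above:

from sglang.srt.managers.eplb_algorithms import EplbAlgorithm, compute_algorithm

# "auto" prefers the hierarchical variant only when expert groups divide evenly across nodes.
print(compute_algorithm("auto", num_groups=8, num_nodes=2))     # EplbAlgorithm.deepseek_hierarchical
print(compute_algorithm("auto", num_groups=None, num_nodes=2))  # EplbAlgorithm.deepseek
print(compute_algorithm("deepseek_vec", num_groups=None, num_nodes=1))  # EplbAlgorithm.deepseek_vec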