sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py CHANGED

@@ -535,6 +535,17 @@ class OpenAIServingChat(OpenAIServingBase):
                     choices=[choice_data],
                     model=request.model,
                 )
+
+                # Add usage stats if continuous_usage_stats is enabled
+                if (
+                    request.stream_options
+                    and request.stream_options.continuous_usage_stats
+                ):
+                    chunk.usage = UsageProcessor.calculate_token_usage(
+                        prompt_tokens=prompt_tokens.get(index, 0),
+                        completion_tokens=completion_tokens.get(index, 0),
+                    )
+
                 yield f"data: {chunk.model_dump_json()}\n\n"

                 # Handle tool calls
@@ -579,6 +590,17 @@ class OpenAIServingChat(OpenAIServingBase):
                     choices=[choice_data],
                     model=request.model,
                 )
+
+                # Add usage stats if continuous_usage_stats is enabled
+                if (
+                    request.stream_options
+                    and request.stream_options.continuous_usage_stats
+                ):
+                    chunk.usage = UsageProcessor.calculate_token_usage(
+                        prompt_tokens=prompt_tokens.get(index, 0),
+                        completion_tokens=completion_tokens.get(index, 0),
+                    )
+
                 yield f"data: {chunk.model_dump_json()}\n\n"

                 # Send finish_reason chunks for each index that completed
@@ -1056,6 +1078,16 @@ class OpenAIServingChat(OpenAIServingBase):
             choices=[choice_data],
             model=request.model,
         )
+
+        # Add usage stats if continuous_usage_stats is enabled
+        if request.stream_options and request.stream_options.continuous_usage_stats:
+            prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+            completion_tokens = content["meta_info"].get("completion_tokens", 0)
+            chunk.usage = UsageProcessor.calculate_token_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+
         yield f"data: {chunk.model_dump_json()}\n\n"

         # Yield tool calls
@@ -1096,6 +1128,16 @@ class OpenAIServingChat(OpenAIServingBase):
             choices=[choice_data],
             model=request.model,
         )
+
+        # Add usage stats if continuous_usage_stats is enabled
+        if request.stream_options and request.stream_options.continuous_usage_stats:
+            prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+            completion_tokens = content["meta_info"].get("completion_tokens", 0)
+            chunk.usage = UsageProcessor.calculate_token_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )
+
         yield f"data: {chunk.model_dump_json()}\n\n"

     def _check_for_unstreamed_tool_args(
sglang/srt/entrypoints/openai/serving_completions.py CHANGED

@@ -272,6 +272,16 @@ class OpenAIServingCompletion(OpenAIServingBase):
                 model=request.model,
             )

+            # Add usage stats if continuous_usage_stats is enabled
+            if (
+                request.stream_options
+                and request.stream_options.continuous_usage_stats
+            ):
+                chunk.usage = UsageProcessor.calculate_token_usage(
+                    prompt_tokens=prompt_tokens.get(index, 0),
+                    completion_tokens=completion_tokens.get(index, 0),
+                )
+
             yield f"data: {chunk.model_dump_json()}\n\n"

             if request.return_hidden_states and hidden_states:
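
Both streaming paths above (chat and completions) now attach a usage object to every SSE chunk when the client opts in through stream_options. A minimal client-side sketch of what that looks like; the endpoint, port, and model name are placeholders, and the exact stream_options field names follow the protocol change shipped alongside these handlers:

import json
import requests

# Placeholder endpoint/model for a locally running sglang server.
url = "http://localhost:30000/v1/chat/completions"
payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Say hi"}],
    "stream": True,
    # continuous_usage_stats is the new flag read by the handlers above;
    # include_usage is the standard OpenAI stream_options field.
    "stream_options": {"include_usage": True, "continuous_usage_stats": True},
}

with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: ") or line == b"data: [DONE]":
            continue
        chunk = json.loads(line[len(b"data: "):])
        # With the flag set, a usage object should now arrive on every chunk,
        # not only at the end of the stream.
        print(chunk.get("usage"))

When the flag is absent the handlers take the same path as before, so existing streaming clients are unaffected.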
sglang/srt/entrypoints/openai/serving_embedding.py CHANGED

@@ -126,6 +126,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
             **prompt_kwargs,
             rid=request.rid,
             priority=request.priority,
+            dimensions=request.dimensions,
         )

         return adapted_request, request
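
The embedding change is a single forwarded field: the OpenAI-style dimensions parameter is now copied onto the internal request instead of being dropped. An illustrative request follows (placeholder endpoint and model; whether the value is honored still depends on the served embedding model):

import requests

# Placeholder endpoint/model; "dimensions" follows the OpenAI embeddings API shape
# and is now forwarded as request.dimensions per the hunk above.
resp = requests.post(
    "http://localhost:30000/v1/embeddings",
    json={"model": "default", "input": "hello world", "dimensions": 256},
)
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # expect 256 if the model supports truncated embeddings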
sglang/srt/environ.py CHANGED
@@ -129,10 +129,13 @@ class Envs:
     SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+    SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS = EnvInt(500)
+    SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE = EnvInt(64)

     # Scheduler: memory leak test
     SGLANG_TEST_RETRACT = EnvBool(False)
     SGLANG_TEST_RETRACT_INTERVAL = EnvInt(3)
+    SGLANG_TEST_RETRACT_NO_PREFILL_BS = EnvInt(2 ** 31)
     SGLANG_ENABLE_RUNTIME_MEM_LEAK_CHECK = EnvBool(False)

     # Scheduler: new token ratio hyperparameters
@@ -180,6 +183,7 @@ class Envs:

     # Triton
     SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+    SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE = EnvBool(False)

     # Torch Compile
     SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
@@ -238,6 +242,9 @@ class Envs:
     SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
     SGLANG_RESIZE_RESAMPLE = EnvStr("")

+    # Release & Resume Memory
+    SGLANG_MEMORY_SAVER_CUDA_GRAPH = EnvBool(False)
+
     # Ktransformers
     SGLANG_KT_MOE_NUM_GPU_EXPERTS = EnvInt(None)
     SGLANG_KT_MOE_CPUINFER = EnvInt(None)
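
The new environment knobs can be set in the environment that launches the server. A hedged sketch (values are illustrative, and the accepted boolean spellings are whatever EnvBool parses):

import os

# Illustrative values only; set these before the sglang process reads its Envs,
# e.g. before `python -m sglang.launch_server ...`.

# Tracing: tuning for the OTLP span exporter (defaults above: 500 ms delay, batch size 64).
os.environ["SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS"] = "250"
os.environ["SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE"] = "128"

# Release & resume memory: opt in to the CUDA-graph-aware memory saver path.
# ("true" as the enabling spelling is an assumption; EnvBool decides what counts as true.)
os.environ["SGLANG_MEMORY_SAVER_CUDA_GRAPH"] = "true"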
sglang/srt/eplb/expert_distribution.py CHANGED

@@ -20,6 +20,7 @@ import time
 from abc import ABC
 from collections import deque
 from contextlib import contextmanager
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type

 import einops
@@ -27,6 +28,7 @@ import torch
 import torch.distributed

 from sglang.srt.environ import envs
+from sglang.srt.metrics.collector import ExpertDispatchCollector
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import Withable, is_npu
@@ -660,6 +662,10 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
         self.window_sizes = [10, 100, 1000]
         self._history = _DequeCollection(maxlens=self.window_sizes)
         self._rank = torch.distributed.get_rank()
+        self._expert_dispatch_collector = ExpertDispatchCollector(
+            self._expert_location_metadata.ep_size
+        )
+        self._collection_counter = 0

     def append(
         self,
@@ -691,6 +697,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
         )

         if self._rank == 0:
+            self._collect_metrics_if_needed(gpu_physical_count)
+
             utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
             utilization_rate = torch.mean(utilization_rate_tensor).item()
             self._history.append(utilization_rate)
@@ -706,6 +714,31 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
                 # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
             )

+    def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
+        # sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0
+        if (
+            envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL > 0
+            and self._collection_counter % envs.SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL
+            == 0
+        ):
+            for layer_idx in range(self._expert_location_metadata.num_layers):
+                count_of_layer = (
+                    self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
+                        layer=str(layer_idx)
+                    )
+                )
+                # Exclude the +Inf bucket.
+                assert (
+                    self._expert_location_metadata.ep_size
+                    == len(count_of_layer._buckets) - 1
+                ), f"{self._expert_location_metadata.ep_size=}, {len(count_of_layer._buckets)=}"
+                for gpu_rank in range(self._expert_location_metadata.ep_size):
+                    count = gpu_physical_count[layer_idx, gpu_rank]
+                    if count > 0:
+                        count_of_layer._sum.inc(count * gpu_rank)
+                        count_of_layer._buckets[gpu_rank].inc(count)
+        self._collection_counter += 1
+

 class _DequeCollection:
     def __init__(self, maxlens: List[int]):
@@ -868,7 +901,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):


 def _dump_to_file(name, data):
-    save_dir = envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get()
+    save_dir = Path(envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get())
     path_output = save_dir / name
     logger.info(f"Write expert distribution to {path_output}")
     if not save_dir.exists():
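
The _collect_metrics_if_needed method above repurposes a per-layer histogram as a vector of per-GPU counters: bucket i accumulates the dispatch count for GPU rank i, and the assert requires exactly ep_size finite buckets. A standalone sketch of that trick, assuming prometheus_client is the metrics backend (the real ExpertDispatchCollector lives in sglang.srt.metrics.collector and may be configured differently):

# Standalone sketch of the bucket-per-rank trick, assuming prometheus_client.
# Names and bucket layout are illustrative, not the actual ExpertDispatchCollector.
from prometheus_client import Histogram, generate_latest

EP_SIZE = 4
heatmap = Histogram(
    "eplb_gpu_physical_count",
    "Expert dispatch heatmap: bucket i accumulates the count routed to GPU rank i.",
    labelnames=["layer"],
    buckets=list(range(EP_SIZE)),  # EP_SIZE finite buckets; +Inf is appended automatically
)

def record_layer(layer_idx, per_rank_counts):
    child = heatmap.labels(layer=str(layer_idx))
    assert EP_SIZE == len(child._buckets) - 1  # exclude the +Inf bucket, as in the diff
    for gpu_rank, count in enumerate(per_rank_counts):
        if count > 0:
            child._sum.inc(count * gpu_rank)     # histogram "sum" becomes sum(rank * count)
            child._buckets[gpu_rank].inc(count)  # bucket index doubles as the GPU rank

record_layer(0, [5, 0, 3, 2])
# The exposed *_bucket series are cumulative (Prometheus `le` semantics), so a dashboard
# recovers the per-rank heatmap by differencing adjacent buckets.
print(generate_latest().decode())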
sglang/srt/eplb/expert_location.py CHANGED

@@ -85,7 +85,9 @@ class ExpertLocationMetadata:
     # -------------------------------- construction ------------------------------------

     @staticmethod
-    def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
+    def init_trivial(
+        server_args: ServerArgs, model_config: ModelConfig, moe_ep_rank: int
+    ):
         """Trivial location - logical expert i corresponds to physical expert i"""
         common = ExpertLocationMetadata._init_common(server_args, model_config)

@@ -106,6 +108,7 @@ class ExpertLocationMetadata:
             server_args,
             model_config,
             physical_to_logical_map=physical_to_logical_map,
+            moe_ep_rank=moe_ep_rank,
         )

     @staticmethod
@@ -113,6 +116,7 @@ class ExpertLocationMetadata:
         server_args: ServerArgs,
         model_config: ModelConfig,
         physical_to_logical_map,
+        moe_ep_rank: int = None,
     ):
         if not isinstance(physical_to_logical_map, torch.Tensor):
             physical_to_logical_map = torch.tensor(physical_to_logical_map)
@@ -125,8 +129,11 @@ class ExpertLocationMetadata:

         model_config_for_expert_location = common["model_config_for_expert_location"]
         logical_to_all_physical_map = _compute_logical_to_all_physical_map(
-            physical_to_logical_map,
+            server_args=server_args,
+            physical_to_logical_map=physical_to_logical_map,
             num_logical_experts=model_config_for_expert_location.num_logical_experts,
+            ep_size=common["ep_size"],
+            moe_ep_rank=moe_ep_rank,
         )

         return ExpertLocationMetadata._init_raw(
@@ -233,7 +240,7 @@ class ExpertLocationMetadata:
             compute_logical_to_rank_dispatch_physical_map(
                 server_args=server_args,
                 logical_to_all_physical_map=logical_to_all_physical_map,
-                num_gpus=ep_size,
+                ep_size=ep_size,
                 num_physical_experts=num_physical_experts,
                 # TODO improve when we have real EP rank
                 ep_rank=torch.distributed.get_rank() % ep_size,
@@ -303,7 +310,11 @@ def set_global_expert_location_metadata(value):


 def _compute_logical_to_all_physical_map(
-    physical_to_logical_map: torch.Tensor, num_logical_experts: int
+    server_args: ServerArgs,
+    physical_to_logical_map: torch.Tensor,
+    num_logical_experts: int,
+    ep_size: int,
+    moe_ep_rank: int,
 ):
     # This is rarely called, so we use for loops for maximum clarity

@@ -312,6 +323,8 @@ def _compute_logical_to_all_physical_map(
     logical_to_all_physical_map = [
         [[] for _ in range(num_logical_experts)] for _ in range(num_layers)
     ]
+
+    # Find out the candidate physical experts for each logical expert on each layer
     for layer_id in range(num_layers):
         for physical_expert_id in range(num_physical_experts):
             logical_expert_id = physical_to_logical_map[
@@ -321,6 +334,32 @@ def _compute_logical_to_all_physical_map(
                 physical_expert_id
             )

+    # Replace by the physical expert on local GPU or node if possible
+    if moe_ep_rank is not None:
+        num_gpus_per_node = server_args.ep_size // server_args.nnodes
+        num_local_gpu_physical_experts = num_physical_experts // ep_size
+        num_local_node_physical_experts = (
+            num_local_gpu_physical_experts * num_gpus_per_node
+        )
+        for layer_id in range(num_layers):
+            for logical_expert_id in range(num_logical_experts):
+                # Try to find the nearest physical expert
+                nearest_expert = _find_nearest_expert(
+                    candidate_physical_expert_ids=logical_to_all_physical_map[layer_id][
+                        logical_expert_id
+                    ],
+                    num_local_gpu_physical_experts=num_local_gpu_physical_experts,
+                    moe_ep_rank=moe_ep_rank,
+                    num_gpus_per_node=num_gpus_per_node,
+                    num_local_node_physical_experts=num_local_node_physical_experts,
+                )
+
+                # Replace by the nearest physical expert
+                if nearest_expert != -1:
+                    logical_to_all_physical_map[layer_id][logical_expert_id] = [
+                        nearest_expert
+                    ]
+
     logical_to_all_physical_map = _pad_nested_array(
         logical_to_all_physical_map, pad_value=-1
     )
@@ -343,21 +382,21 @@ def _pad_nested_array(arr, pad_value):
 def compute_logical_to_rank_dispatch_physical_map(
     server_args: ServerArgs,
     logical_to_all_physical_map: torch.Tensor,
-    num_gpus: int,
+    ep_size: int,
     num_physical_experts: int,
     ep_rank: int,
     seed: int = 42,
 ):
     r = random.Random(seed)

-    num_local_gpu_physical_experts = num_physical_experts // num_gpus
+    num_local_gpu_physical_experts = num_physical_experts // ep_size
     num_gpus_per_node = server_args.ep_size // server_args.nnodes
     num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
     dtype = logical_to_all_physical_map.dtype

     logical_to_rank_dispatch_physical_map = torch.full(
-        size=(num_gpus, num_layers, num_logical_experts),
+        size=(ep_size, num_layers, num_logical_experts),
         fill_value=-1,
         dtype=dtype,
     )
@@ -371,33 +410,17 @@ def compute_logical_to_rank_dispatch_physical_map(
                 :, layer_id, logical_expert_id
             ]

-            for gpu_id in range(num_gpus):
-                same_gpu_physical_expert_ids = [
-                    physical_expert_id
-                    for physical_expert_id in candidate_physical_expert_ids
-                    if _compute_gpu_id_of_physical_expert(
-                        physical_expert_id, num_local_gpu_physical_experts
-                    )
-                    == gpu_id
-                ]
-                if len(same_gpu_physical_expert_ids) > 0:
-                    # 1. Prefer same-GPU experts
-                    output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
-                else:
-                    # 2. Otherwise, prefer same-node experts
-                    node_id = gpu_id // num_gpus_per_node
-                    same_node_physical_expert_ids = [
-                        physical_expert_id
-                        for physical_expert_id in candidate_physical_expert_ids
-                        if _compute_node_id_of_physical_expert(
-                            physical_expert_id, num_local_node_physical_experts
-                        )
-                        == node_id
-                    ]
-                    if len(same_node_physical_expert_ids) > 0:
-                        output_partial[gpu_id] = same_node_physical_expert_ids[0]
+            for moe_ep_rank in range(ep_size):
+                # Fill with the nearest physical expert
+                output_partial[moe_ep_rank] = _find_nearest_expert(
+                    candidate_physical_expert_ids=candidate_physical_expert_ids,
+                    num_local_gpu_physical_experts=num_local_gpu_physical_experts,
+                    moe_ep_rank=moe_ep_rank,
+                    num_gpus_per_node=num_gpus_per_node,
+                    num_local_node_physical_experts=num_local_node_physical_experts,
+                )

-            # 3. Fill remaining slots with fair random choices
+            # Fill remaining slots with fair random choices
             num_remain = torch.sum(output_partial == -1).item()
             output_partial[output_partial == -1] = torch.tensor(
                 _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -434,6 +457,46 @@ def _compute_node_id_of_physical_expert(
     return physical_expert_id // num_local_host_physical_experts


+def _find_nearest_expert(
+    candidate_physical_expert_ids: List[int],
+    num_local_gpu_physical_experts: int,
+    moe_ep_rank: int,
+    num_gpus_per_node: int,
+    num_local_node_physical_experts: int,
+) -> int:
+    # 1. If only one candidate, return it directly
+    if len(candidate_physical_expert_ids) == 1:
+        return candidate_physical_expert_ids[0]
+
+    # 2. Prefer same-GPU experts
+    same_gpu_physical_expert_ids = [
+        physical_expert_id
+        for physical_expert_id in candidate_physical_expert_ids
+        if _compute_gpu_id_of_physical_expert(
+            physical_expert_id, num_local_gpu_physical_experts
+        )
+        == moe_ep_rank
+    ]
+    if len(same_gpu_physical_expert_ids) > 0:
+        return same_gpu_physical_expert_ids[0]
+
+    # 3. Otherwise, prefer same-node experts
+    node_rank = moe_ep_rank // num_gpus_per_node
+    same_node_physical_expert_ids = [
+        physical_expert_id
+        for physical_expert_id in candidate_physical_expert_ids
+        if _compute_node_id_of_physical_expert(
+            physical_expert_id, num_local_node_physical_experts
+        )
+        == node_rank
+    ]
+    if len(same_node_physical_expert_ids) > 0:
+        return same_node_physical_expert_ids[0]
+
+    # 4. At last, leave it as -1 to indicate not found.
+    return -1
+
+
 def _fair_choices(arr: List, k: int, r: random.Random) -> List:
     quotient, remainder = divmod(k, len(arr))
     ans = arr * quotient + r.sample(arr, k=remainder)
@@ -459,11 +522,15 @@ class ModelConfigForExpertLocation:


 def compute_initial_expert_location_metadata(
-    server_args: ServerArgs, model_config: ModelConfig
+    server_args: ServerArgs,
+    model_config: ModelConfig,
+    moe_ep_rank: int,
 ) -> Optional[ExpertLocationMetadata]:
     data = server_args.init_expert_location
     if data == "trivial":
-        return ExpertLocationMetadata.init_trivial(server_args, model_config)
+        return ExpertLocationMetadata.init_trivial(
+            server_args, model_config, moe_ep_rank
+        )

     # TODO unify with the utils function
     if data.endswith(".pt"):
@@ -478,7 +545,10 @@ def compute_initial_expert_location_metadata(
             "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
         )
         return ExpertLocationMetadata.init_by_mapping(
-            server_args, model_config, **data_dict
+            server_args,
+            model_config,
+            **data_dict,
+            moe_ep_rank=moe_ep_rank,
         )
     elif "logical_count" in data_dict:
         logger.info(
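
To make the same-GPU-then-same-node preference in _find_nearest_expert concrete, here is a self-contained toy version with made-up sizes (it mirrors the logic above but does not import sglang):

# Toy version of the locality preference with made-up sizes:
# 16 physical experts, ep_size=8 GPUs, 2 nodes -> 2 experts/GPU, 4 GPUs/node, 8 experts/node.
from typing import List

NUM_LOCAL_GPU_EXPERTS = 2
NUM_GPUS_PER_NODE = 4
NUM_LOCAL_NODE_EXPERTS = NUM_LOCAL_GPU_EXPERTS * NUM_GPUS_PER_NODE

def gpu_of(expert_id: int) -> int:
    return expert_id // NUM_LOCAL_GPU_EXPERTS

def node_of(expert_id: int) -> int:
    return expert_id // NUM_LOCAL_NODE_EXPERTS

def find_nearest_expert(candidates: List[int], moe_ep_rank: int) -> int:
    if len(candidates) == 1:
        return candidates[0]
    same_gpu = [e for e in candidates if gpu_of(e) == moe_ep_rank]
    if same_gpu:
        return same_gpu[0]
    node_rank = moe_ep_rank // NUM_GPUS_PER_NODE
    same_node = [e for e in candidates if node_of(e) == node_rank]
    if same_node:
        return same_node[0]
    return -1  # no local replica; the caller falls back to fair random choices

# A logical expert replicated on physical experts 3 (GPU 1, node 0) and 11 (GPU 5, node 1):
print(find_nearest_expert([3, 11], moe_ep_rank=5))  # 11 -> same-GPU replica wins
print(find_nearest_expert([3, 11], moe_ep_rank=6))  # 11 -> same-node replica wins
print(find_nearest_expert([3], moe_ep_rank=0))      # 3  -> single candidate returned as-is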
sglang/srt/grpc/compile_proto.py CHANGED

@@ -18,6 +18,9 @@ Options:
 ### Install Dependencies
 pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"

+Please make sure to use the same version of grpcio and grpcio-tools specified in pyproject.toml
+otherwise update the versions specified in pyproject.toml
+
 ### Run Script
 cd python/sglang/srt/grpc
 python compile_proto.py
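
An illustrative way to check the locally installed versions against the pyproject.toml pins before running the script:

# Illustrative check: confirm the installed grpcio / grpcio-tools match the pins
# in pyproject.toml before running compile_proto.py.
import importlib.metadata as md

for pkg in ("grpcio", "grpcio-tools"):
    print(pkg, md.version(pkg))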