PyPI - sglang - Versions diffs - 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl - Mend

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

sglang/bench_offline_throughput.py +10 -8
sglang/bench_one_batch.py +7 -6
sglang/bench_one_batch_server.py +157 -21
sglang/bench_serving.py +137 -59
sglang/compile_deep_gemm.py +5 -5
sglang/eval/loogle_eval.py +157 -0
sglang/lang/chat_template.py +78 -78
sglang/lang/tracer.py +1 -1
sglang/srt/code_completion_parser.py +1 -1
sglang/srt/configs/deepseekvl2.py +2 -2
sglang/srt/configs/model_config.py +40 -28
sglang/srt/constrained/base_grammar_backend.py +55 -72
sglang/srt/constrained/llguidance_backend.py +25 -21
sglang/srt/constrained/outlines_backend.py +27 -26
sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
sglang/srt/constrained/xgrammar_backend.py +69 -43
sglang/srt/conversation.py +49 -44
sglang/srt/disaggregation/base/conn.py +1 -0
sglang/srt/disaggregation/decode.py +129 -135
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
sglang/srt/disaggregation/fake/conn.py +3 -13
sglang/srt/disaggregation/kv_events.py +357 -0
sglang/srt/disaggregation/mini_lb.py +57 -24
sglang/srt/disaggregation/mooncake/conn.py +238 -122
sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
sglang/srt/disaggregation/nixl/conn.py +10 -19
sglang/srt/disaggregation/prefill.py +132 -47
sglang/srt/disaggregation/utils.py +123 -6
sglang/srt/distributed/utils.py +3 -3
sglang/srt/entrypoints/EngineBase.py +5 -0
sglang/srt/entrypoints/engine.py +44 -9
sglang/srt/entrypoints/http_server.py +23 -6
sglang/srt/entrypoints/http_server_engine.py +5 -2
sglang/srt/function_call/base_format_detector.py +250 -0
sglang/srt/function_call/core_types.py +34 -0
sglang/srt/function_call/deepseekv3_detector.py +157 -0
sglang/srt/function_call/ebnf_composer.py +234 -0
sglang/srt/function_call/function_call_parser.py +175 -0
sglang/srt/function_call/llama32_detector.py +74 -0
sglang/srt/function_call/mistral_detector.py +84 -0
sglang/srt/function_call/pythonic_detector.py +163 -0
sglang/srt/function_call/qwen25_detector.py +67 -0
sglang/srt/function_call/utils.py +35 -0
sglang/srt/hf_transformers_utils.py +46 -7
sglang/srt/layers/attention/aiter_backend.py +513 -0
sglang/srt/layers/attention/flashattention_backend.py +64 -18
sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
sglang/srt/layers/attention/flashmla_backend.py +340 -78
sglang/srt/layers/attention/triton_backend.py +3 -0
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
sglang/srt/layers/attention/utils.py +6 -4
sglang/srt/layers/attention/vision.py +1 -1
sglang/srt/layers/communicator.py +451 -0
sglang/srt/layers/dp_attention.py +61 -21
sglang/srt/layers/layernorm.py +1 -1
sglang/srt/layers/logits_processor.py +46 -11
sglang/srt/layers/moe/cutlass_moe.py +207 -0
sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
sglang/srt/layers/moe/ep_moe/layer.py +105 -51
sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
sglang/srt/layers/moe/topk.py +67 -10
sglang/srt/layers/multimodal.py +70 -0
sglang/srt/layers/quantization/__init__.py +8 -3
sglang/srt/layers/quantization/blockwise_int8.py +2 -2
sglang/srt/layers/quantization/deep_gemm.py +77 -74
sglang/srt/layers/quantization/fp8.py +92 -2
sglang/srt/layers/quantization/fp8_kernel.py +3 -3
sglang/srt/layers/quantization/fp8_utils.py +6 -0
sglang/srt/layers/quantization/gptq.py +298 -6
sglang/srt/layers/quantization/int8_kernel.py +20 -7
sglang/srt/layers/quantization/qoq.py +244 -0
sglang/srt/layers/sampler.py +0 -4
sglang/srt/layers/vocab_parallel_embedding.py +18 -7
sglang/srt/lora/lora_manager.py +2 -4
sglang/srt/lora/mem_pool.py +4 -4
sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
sglang/srt/lora/utils.py +1 -1
sglang/srt/managers/data_parallel_controller.py +3 -3
sglang/srt/managers/deepseek_eplb.py +278 -0
sglang/srt/managers/detokenizer_manager.py +21 -8
sglang/srt/managers/eplb_manager.py +55 -0
sglang/srt/managers/expert_distribution.py +704 -56
sglang/srt/managers/expert_location.py +394 -0
sglang/srt/managers/expert_location_dispatch.py +91 -0
sglang/srt/managers/io_struct.py +19 -4
sglang/srt/managers/mm_utils.py +294 -140
sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
sglang/srt/managers/multimodal_processors/internvl.py +14 -5
sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
sglang/srt/managers/multimodal_processors/llava.py +46 -0
sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
sglang/srt/managers/schedule_batch.py +122 -42
sglang/srt/managers/schedule_policy.py +1 -5
sglang/srt/managers/scheduler.py +205 -138
sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
sglang/srt/managers/session_controller.py +1 -1
sglang/srt/managers/tokenizer_manager.py +232 -58
sglang/srt/managers/tp_worker.py +12 -9
sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
sglang/srt/mem_cache/base_prefix_cache.py +3 -0
sglang/srt/mem_cache/chunk_cache.py +3 -1
sglang/srt/mem_cache/hiradix_cache.py +4 -4
sglang/srt/mem_cache/memory_pool.py +76 -52
sglang/srt/mem_cache/multimodal_cache.py +45 -0
sglang/srt/mem_cache/radix_cache.py +58 -5
sglang/srt/metrics/collector.py +314 -39
sglang/srt/mm_utils.py +10 -0
sglang/srt/model_executor/cuda_graph_runner.py +29 -19
sglang/srt/model_executor/expert_location_updater.py +422 -0
sglang/srt/model_executor/forward_batch_info.py +5 -1
sglang/srt/model_executor/model_runner.py +163 -68
sglang/srt/model_loader/loader.py +10 -6
sglang/srt/models/clip.py +5 -1
sglang/srt/models/deepseek_janus_pro.py +2 -2
sglang/srt/models/deepseek_v2.py +308 -351
sglang/srt/models/exaone.py +8 -3
sglang/srt/models/gemma3_mm.py +70 -33
sglang/srt/models/llama.py +2 -0
sglang/srt/models/llama4.py +15 -8
sglang/srt/models/llava.py +258 -7
sglang/srt/models/mimo_mtp.py +220 -0
sglang/srt/models/minicpmo.py +5 -12
sglang/srt/models/mistral.py +71 -1
sglang/srt/models/mixtral.py +98 -34
sglang/srt/models/mllama.py +3 -3
sglang/srt/models/pixtral.py +467 -0
sglang/srt/models/qwen2.py +95 -26
sglang/srt/models/qwen2_5_vl.py +8 -0
sglang/srt/models/qwen2_moe.py +330 -60
sglang/srt/models/qwen2_vl.py +6 -0
sglang/srt/models/qwen3.py +52 -10
sglang/srt/models/qwen3_moe.py +411 -48
sglang/srt/models/roberta.py +1 -1
sglang/srt/models/siglip.py +294 -0
sglang/srt/models/torch_native_llama.py +1 -1
sglang/srt/openai_api/adapter.py +58 -20
sglang/srt/openai_api/protocol.py +6 -8
sglang/srt/operations.py +154 -0
sglang/srt/operations_strategy.py +31 -0
sglang/srt/reasoning_parser.py +3 -3
sglang/srt/sampling/custom_logit_processor.py +18 -3
sglang/srt/sampling/sampling_batch_info.py +4 -56
sglang/srt/sampling/sampling_params.py +2 -2
sglang/srt/server_args.py +162 -22
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
sglang/srt/speculative/eagle_utils.py +138 -7
sglang/srt/speculative/eagle_worker.py +69 -21
sglang/srt/utils.py +74 -17
sglang/test/few_shot_gsm8k.py +2 -2
sglang/test/few_shot_gsm8k_engine.py +2 -2
sglang/test/run_eval.py +2 -2
sglang/test/runners.py +8 -1
sglang/test/send_one.py +13 -3
sglang/test/simple_eval_common.py +1 -1
sglang/test/simple_eval_humaneval.py +1 -1
sglang/test/test_cutlass_moe.py +278 -0
sglang/test/test_programs.py +5 -5
sglang/test/test_utils.py +55 -14
sglang/utils.py +3 -3
sglang/version.py +1 -1
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
sglang/srt/function_call_parser.py +0 -858
sglang/srt/platforms/interface.py +0 -371
/sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
/sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0

sglang/srt/managers/expert_location.py ADDED Viewed

@@ -0,0 +1,394 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+import torch
+import torch.distributed
+import torch.nn.functional as F
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.managers import deepseek_eplb
+from sglang.srt.model_loader import get_model_architecture
+from sglang.srt.server_args import ServerArgs
+logger = logging.getLogger(__name__)
+@dataclass
+class ExpertLocationMetadata:
+    """Per-layer lookup tables relating logical experts to physical expert slots.
+    Instances are built through the ``init_*`` static methods; ``__post_init__``
+    asserts that the four tensors agree on their shared dimensions.
+    """
+    physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    # Entries equal to -1 are padding; __post_init__ requires X == num_physical_experts.
+    logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
+    logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: torch.Tensor  # (layers, num_logical_experts)
+    # -------------------------------- properties ------------------------------------
+    @property
+    def num_layers(self) -> int:
+        """Size of dimension 0 of ``physical_to_logical_map``."""
+        return self.physical_to_logical_map.shape[0]
+    @property
+    def num_physical_experts(self) -> int:
+        """Size of dimension 1 of ``physical_to_logical_map``."""
+        return self.physical_to_logical_map.shape[1]
+    @property
+    def num_local_physical_experts(self) -> int:
+        """Physical experts per EP rank; asserts that they divide evenly by ``ep_size``."""
+        ans, remainder = divmod(self.num_physical_experts, self.ep_size)
+        assert remainder == 0
+        return ans
+    @property
+    def num_logical_experts(self) -> int:
+        """Size of dimension 1 of ``logical_to_all_physical_map``."""
+        return self.logical_to_all_physical_map.shape[1]
+    @property
+    def ep_size(self):
+        """Expert-parallel size, currently read from the torch.distributed world size."""
+        # NOTE(review): construction (_init_common) uses server_args.ep_size while this
+        # property uses the world size; the two agree only when those values are equal.
+        # TODO change when EP size != world size
+        return torch.distributed.get_world_size()
+    def __post_init__(self):
+        """Assert that layer, logical-expert and physical-expert dimensions match across fields."""
+        num_layers_0, num_physical_experts_0 = self.physical_to_logical_map.shape
+        num_layers_1, num_logical_experts_0, num_physical_experts_1 = (
+            self.logical_to_all_physical_map.shape
+        )
+        num_layers_2, num_logical_experts_1 = (
+            self.logical_to_all_physical_map_num_valid.shape
+        )
+        num_layers_3, num_logical_experts_2 = (
+            self.logical_to_rank_dispatch_physical_map.shape
+        )
+        assert num_layers_0 == num_layers_1 == num_layers_2 == num_layers_3
+        assert num_logical_experts_0 == num_logical_experts_1 == num_logical_experts_2
+        # Holds because _init_raw pads the last dimension up to num_physical_experts.
+        assert num_physical_experts_0 == num_physical_experts_1
+    # -------------------------------- construction ------------------------------------
+    @staticmethod
+    def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
+        """Trivial location - logical expert i corresponds to physical expert i"""
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+        num_physical_experts = common["num_physical_experts"]
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_layers = model_config_for_expert_location.num_layers
+        num_logical_experts = model_config_for_expert_location.num_logical_experts
+        # Physical slot p hosts logical expert p % num_logical_experts, so slots beyond
+        # the logical count wrap around; every layer gets the same row.
+        physical_to_logical_map = (
+            torch.arange(0, num_physical_experts).repeat(num_layers, 1)
+            % num_logical_experts
+        )
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args,
+            model_config,
+            physical_to_logical_map=physical_to_logical_map,
+        )
+    @staticmethod
+    def init_by_mapping(
+        server_args: ServerArgs,
+        model_config: ModelConfig,
+        physical_to_logical_map,
+    ):
+        """Build the metadata from an explicit physical->logical table.
+        ``physical_to_logical_map`` may be a tensor or anything ``torch.tensor`` accepts;
+        it is moved to ``server_args.device`` before the inverse table is derived.
+        """
+        if not isinstance(physical_to_logical_map, torch.Tensor):
+            physical_to_logical_map = torch.tensor(physical_to_logical_map)
+        physical_to_logical_map = physical_to_logical_map.to(server_args.device)
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        logical_to_all_physical_map = _compute_logical_to_all_physical_map(
+            physical_to_logical_map,
+            num_logical_experts=model_config_for_expert_location.num_logical_experts,
+        )
+        return ExpertLocationMetadata._init_raw(
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map,
+            logical_to_all_physical_map=logical_to_all_physical_map,
+        )
+    @staticmethod
+    def init_by_eplb(
+        server_args: ServerArgs, model_config: ModelConfig, logical_count: torch.Tensor
+    ):
+        """Build the metadata by running ``deepseek_eplb.rebalance_experts`` on ``logical_count``.
+        Non-tensor input is converted with ``torch.tensor``, and a 2-D input gains a
+        leading dimension of size 1 before being moved to ``server_args.device``.
+        """
+        if not isinstance(logical_count, torch.Tensor):
+            logical_count = torch.tensor(logical_count)
+        if len(logical_count.shape) == 2:
+            logical_count = logical_count.unsqueeze(0)
+        logical_count = logical_count.to(server_args.device)
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_physical_experts = common["num_physical_experts"]
+        # A disaggregation mode of "null" is handed to the balancer as "decode".
+        phase = server_args.disaggregation_mode
+        if phase == "null":
+            phase = "decode"
+        # The third returned value (expert_count) is not used below.
+        physical_to_logical_map, logical_to_all_physical_map, expert_count = (
+            deepseek_eplb.rebalance_experts(
+                tokens_per_expert=logical_count,
+                num_physical_experts=num_physical_experts,
+                num_local_physical_experts=num_physical_experts // common["ep_size"],
+                num_groups=model_config_for_expert_location.num_groups,
+                num_nodes=server_args.nnodes,
+                phase=phase,
+            )
+        )
+        return ExpertLocationMetadata._init_raw(
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map,
+            logical_to_all_physical_map=logical_to_all_physical_map,
+        )
+    @staticmethod
+    def _init_common(server_args: ServerArgs, model_config: ModelConfig):
+        """Derive the sizes shared by all constructors and return them in a dict.
+        Asserts that the physical expert count is a multiple of ``server_args.ep_size``.
+        """
+        model_config_for_expert_location = (
+            ModelConfigForExpertLocation.from_model_config(model_config)
+        )
+        # Physical slots = logical experts plus the configured redundant experts.
+        num_physical_experts = (
+            model_config_for_expert_location.num_logical_experts
+            + server_args.ep_num_redundant_experts
+        )
+        ep_size = server_args.ep_size
+        assert num_physical_experts % ep_size == 0
+        num_local_physical_experts = num_physical_experts // ep_size
+        return dict(
+            model_config_for_expert_location=model_config_for_expert_location,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            ep_size=ep_size,
+        )
+    @staticmethod
+    def _init_raw(
+        ep_size: int,
+        physical_to_logical_map: torch.Tensor,
+        logical_to_all_physical_map: torch.Tensor,
+    ):
+        """Assemble an instance from the two primary tables, deriving the other two fields."""
+        _, num_physical_experts = physical_to_logical_map.shape
+        # Right-pad the last dimension with -1 so its length equals num_physical_experts.
+        logical_to_all_physical_map_padded = F.pad(
+            logical_to_all_physical_map,
+            (0, num_physical_experts - logical_to_all_physical_map.shape[-1]),
+            value=-1,
+        )
+        # Number of non-padding entries per (layer, logical expert).
+        logical_to_all_physical_map_num_valid = torch.count_nonzero(
+            logical_to_all_physical_map != -1, dim=-1
+        )
+        # The rank dispatch table is computed from the unpadded map and the calling
+        # process's torch.distributed rank.
+        return ExpertLocationMetadata(
+            physical_to_logical_map=physical_to_logical_map,
+            logical_to_all_physical_map=logical_to_all_physical_map_padded,
+            logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
+            logical_to_rank_dispatch_physical_map=compute_logical_to_rank_dispatch_physical_map(
+                logical_to_all_physical_map=logical_to_all_physical_map,
+                logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
+                num_gpus=ep_size,
+                num_physical_experts=num_physical_experts,
+                ep_rank=torch.distributed.get_rank(),
+            ),
+        )
+    # -------------------------------- mutation ------------------------------------
+    def update(
+        self,
+        other: "ExpertLocationMetadata",
+    ):
+        """Overwrite this instance's tables with the contents of ``other``.
+        Asserts that both instances report the same ``ep_size``.
+        """
+        for field in [
+            "ep_size",
+        ]:
+            assert getattr(self, field) == getattr(other, field)
+        for field in [
+            "physical_to_logical_map",
+            "logical_to_all_physical_map",
+            "logical_to_all_physical_map_num_valid",
+            "logical_to_rank_dispatch_physical_map",
+        ]:
+            # Writes into the existing tensor instead of rebinding the attribute
+            # (presumably so references held elsewhere see the new values - TODO confirm).
+            dst = getattr(self, field)
+            dst[...] = getattr(other, field)
+    # -------------------------------- usage ------------------------------------
+    def logical_to_all_physical(
+        self, layer_id: int, logical_expert_id: int
+    ) -> List[int]:
+        """Return the physical expert ids for one logical expert, with -1 padding removed."""
+        return [
+            physical_expert_id
+            for physical_expert_id in self.logical_to_all_physical_map[
+                layer_id, logical_expert_id
+            ].tolist()
+            if physical_expert_id != -1
+        ]
+# Module-level singleton; remains None until set_global_expert_location_metadata is called.
+_global_expert_location_metadata: Optional[ExpertLocationMetadata] = None
+def get_global_expert_location_metadata():
+    """Return the module-level expert location metadata, or None if it was never set."""
+    current = _global_expert_location_metadata
+    return current
+def set_global_expert_location_metadata(value):
+    """Store ``value`` as the module-level metadata; asserts that nothing was stored before."""
+    global _global_expert_location_metadata
+    not_yet_set = _global_expert_location_metadata is None
+    assert not_yet_set
+    _global_expert_location_metadata = value
+def _compute_logical_to_all_physical_map(
+    physical_to_logical_map: torch.Tensor, num_logical_experts: int
+):
+    """Invert a physical->logical table into a -1 padded logical->physical tensor.
+    For every layer, each logical expert collects the physical slot ids that map to
+    it, in ascending slot order; shorter lists are padded with -1 by
+    ``_pad_nested_array``. The result lives on the input tensor's device.
+    """
+    # Plain Python loops on purpose: this runs rarely, so clarity wins over speed.
+    num_layers, num_physical_experts = physical_to_logical_map.shape
+    buckets_per_layer = []
+    for layer_id in range(num_layers):
+        buckets = [[] for _ in range(num_logical_experts)]
+        for physical_expert_id in range(num_physical_experts):
+            owner = physical_to_logical_map[layer_id, physical_expert_id].item()
+            buckets[owner].append(physical_expert_id)
+        buckets_per_layer.append(buckets)
+    padded = _pad_nested_array(buckets_per_layer, pad_value=-1)
+    return torch.tensor(padded, device=physical_to_logical_map.device)
+def _pad_nested_array(arr, pad_value):
+    """Right-pad every innermost list of a two-level nested list to a common length.
+    Args:
+        arr: list of lists of lists; the innermost lists may differ in length.
+        pad_value: value appended to the shorter innermost lists.
+    Returns:
+        A new nested list in which every innermost list has the length of the
+        longest one. The input lists are not modified.
+    """
+    # default=0 keeps an empty ``arr`` (or one whose outer lists are all empty)
+    # from raising ValueError inside max().
+    max_len = max((len(inner) for outer in arr for inner in outer), default=0)
+    return [
+        [inner + [pad_value] * (max_len - len(inner)) for inner in outer]
+        for outer in arr
+    ]
+# TODO use more sophisticated approaches
+def compute_logical_to_rank_dispatch_physical_map(
+    logical_to_all_physical_map: torch.Tensor,
+    logical_to_all_physical_map_num_valid: torch.Tensor,
+    num_gpus: int,
+    num_physical_experts: int,
+    ep_rank: int,
+    base_seed: int = 42,
+):
+    """Pick, for this rank, one physical expert per (layer, logical expert).
+    A seeded random replica is chosen first; it is then replaced by a replica that
+    lives on ``ep_rank`` itself whenever such a replica exists.
+    Args:
+        logical_to_all_physical_map: 3-D tensor (layers, logical experts, replicas),
+            with -1 marking unused replica positions.
+        logical_to_all_physical_map_num_valid: number of valid replicas per
+            (layer, logical expert).
+        num_gpus: divisor used to derive the number of physical experts per rank.
+        num_physical_experts: total number of physical experts.
+        ep_rank: rank whose dispatch table is being built.
+        base_seed: added to ``ep_rank`` to seed the random generator.
+    Returns:
+        Tensor of shape (layers, logical experts) holding physical expert ids;
+        asserted to contain no -1.
+    """
+    device = logical_to_all_physical_map.device
+    num_local_physical_experts = num_physical_experts // num_gpus
+    num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    # The seed depends on the rank, so each rank gets its own reproducible draw.
+    g = torch.Generator(device=device)
+    g.manual_seed(base_seed + ep_rank)
+    output_shape = (num_layers, num_logical_experts)
+    # Raw draw in [0, 65536), reduced modulo the per-expert replica count to get
+    # a replica position.
+    chosen_index = (
+        torch.randint(
+            0, 65536, output_shape, dtype=torch.int32, device=device, generator=g
+        )
+        % logical_to_all_physical_map_num_valid
+    )
+    # Look up the physical expert id stored at the chosen replica position.
+    logical_to_rank_dispatch_physical_map = torch.gather(
+        logical_to_all_physical_map, dim=2, index=chosen_index.unsqueeze(-1)
+    ).squeeze(-1)
+    assert logical_to_rank_dispatch_physical_map.shape == output_shape
+    # Walk the replica positions in ascending order and override the random pick
+    # wherever the replica is valid and its id // num_local_physical_experts equals
+    # ep_rank. If several replicas qualify, the last position visited wins.
+    for index in range(logical_to_all_physical_map_num_valid.max().item()):
+        partial_logical_to_all_physical_map = logical_to_all_physical_map[:, :, index]
+        is_valid = partial_logical_to_all_physical_map != -1
+        is_same_gpu = (
+            partial_logical_to_all_physical_map // num_local_physical_experts
+        ) == ep_rank
+        logical_to_rank_dispatch_physical_map = torch.where(
+            is_valid & is_same_gpu,
+            partial_logical_to_all_physical_map,
+            logical_to_rank_dispatch_physical_map,
+        )
+    assert torch.all(logical_to_rank_dispatch_physical_map != -1)
+    return logical_to_rank_dispatch_physical_map
+@dataclass
+class ModelConfigForExpertLocation:
+    """The model dimensions that the expert location tables are built from."""
+    num_layers: int
+    num_logical_experts: int
+    num_groups: Optional[int] = None
+    @staticmethod
+    def init_dummy():
+        """Return a placeholder config with one layer and one logical expert."""
+        return ModelConfigForExpertLocation(num_layers=1, num_logical_experts=1)
+    @staticmethod
+    def from_model_config(model_config: ModelConfig):
+        """Ask the model class for its expert location config, else return the dummy.
+        The model class is resolved with ``get_model_architecture``; when it defines
+        ``get_model_config_for_expert_location`` that hook is called with
+        ``model_config.hf_config``.
+        """
+        model_class, _ = get_model_architecture(model_config)
+        if not hasattr(model_class, "get_model_config_for_expert_location"):
+            return ModelConfigForExpertLocation.init_dummy()
+        build_config = model_class.get_model_config_for_expert_location
+        return build_config(model_config.hf_config)
+def compute_initial_expert_location_metadata(
+    server_args: ServerArgs, model_config: ModelConfig
+) -> ExpertLocationMetadata:
+    """Build the initial expert location metadata from ``server_args.init_expert_location``.
+    The setting is interpreted as follows:
+    - ``"trivial"``: use ``ExpertLocationMetadata.init_trivial``.
+    - a string ending in ``.pt``: load it with ``torch.load(..., weights_only=True)``.
+    - a string ending in ``.json``: read the file and parse it as JSON.
+    - anything else: parse the string itself as JSON.
+    The loaded mapping must contain ``physical_to_logical_map`` (checked first) or
+    ``logical_count``.
+    Raises:
+        NotImplementedError: if the loaded mapping contains neither key.
+    """
+    data = server_args.init_expert_location
+    if data == "trivial":
+        logger.info("init_expert_location from trivial")
+        return ExpertLocationMetadata.init_trivial(server_args, model_config)
+    # TODO unify with the utils function
+    if data.endswith(".pt"):
+        data_dict = torch.load(data, weights_only=True)
+    elif data.endswith(".json"):
+        data_dict = json.loads(Path(data).read_text())
+    else:
+        data_dict = json.loads(data)
+    if "physical_to_logical_map" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
+        )
+        # Pass only the expected key: splatting the whole dict would turn any extra
+        # key in the user-supplied data into an unexpected-keyword TypeError.
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args,
+            model_config,
+            physical_to_logical_map=data_dict["physical_to_logical_map"],
+        )
+    elif "logical_count" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_eplb using ServerArgs.init_expert_location"
+        )
+        return ExpertLocationMetadata.init_by_eplb(
+            server_args, model_config, logical_count=data_dict["logical_count"]
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown init_expert_location format ({list(data_dict.keys())=})"
+        )

sglang/srt/managers/expert_location_dispatch.py ADDED Viewed

@@ -0,0 +1,91 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from dataclasses import dataclass
+from typing import Literal, Optional
+import torch
+from sglang.srt.managers.expert_location import get_global_expert_location_metadata
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+@dataclass
+class ExpertLocationDispatchInfo:
+    """Single-layer slice of the global expert location tables, used to route top-k ids."""
+    # Must list the values that topk_ids_logical_to_physical dispatches on.
+    ep_dispatch_algorithm: Literal["static", "dynamic"]
+    # (num_logical_experts,)
+    partial_logical_to_rank_dispatch_physical_map: torch.Tensor
+    # (num_logical_experts, X)
+    partial_logical_to_all_physical_map: torch.Tensor
+    # (num_logical_experts,)
+    partial_logical_to_all_physical_map_num_valid: torch.Tensor
+    num_physical_experts: int
+    @classmethod
+    def init_new(cls, layer_id: int) -> Optional["ExpertLocationDispatchInfo"]:
+        """Create the dispatch info for ``layer_id`` from the global metadata.
+        Returns None when ``global_server_args_dict["ep_dispatch_algorithm"]`` is
+        None; otherwise each tensor field is row ``layer_id`` of the corresponding
+        global table.
+        """
+        ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
+        if ep_dispatch_algorithm is None:
+            return None
+        # Fetched only after the guard: it is not needed when dispatch is disabled.
+        expert_location_metadata = get_global_expert_location_metadata()
+        return cls(
+            ep_dispatch_algorithm=ep_dispatch_algorithm,
+            partial_logical_to_rank_dispatch_physical_map=expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                layer_id, :
+            ],
+            partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
+                layer_id, :
+            ],
+            partial_logical_to_all_physical_map_num_valid=expert_location_metadata.logical_to_all_physical_map_num_valid[
+                layer_id, :
+            ],
+            num_physical_experts=expert_location_metadata.num_physical_experts,
+        )
+def topk_ids_logical_to_physical(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    """Translate logical expert ids in ``topk_ids`` into physical expert ids.
+    Args:
+        topk_ids: tensor of logical expert ids.
+        info: dispatch tables for the current layer, or None to skip translation.
+    Returns:
+        ``topk_ids`` unchanged when ``info`` is None, otherwise the result of the
+        "static" or "dynamic" mapping selected by ``info.ep_dispatch_algorithm``.
+    Raises:
+        NotImplementedError: if ``info.ep_dispatch_algorithm`` is any other value.
+    """
+    if info is None:
+        return topk_ids
+    if info.ep_dispatch_algorithm == "static":
+        return _topk_ids_logical_to_physical_static(topk_ids, info)
+    if info.ep_dispatch_algorithm == "dynamic":
+        return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
+    # Name the rejected value so a misconfiguration is diagnosable from the traceback.
+    raise NotImplementedError(
+        f"Unknown ep_dispatch_algorithm: {info.ep_dispatch_algorithm!r}"
+    )
+def _topk_ids_logical_to_physical_static(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    """Map logical ids by indexing this layer's rank dispatch table with ``topk_ids``."""
+    rank_dispatch_table = info.partial_logical_to_rank_dispatch_physical_map
+    return rank_dispatch_table[topk_ids]
+def _topk_ids_logical_to_physical_dynamic(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    """Map each logical id to a randomly drawn replica from the all-physical table.
+    The input is flattened, one replica position is drawn per id, and the result
+    is viewed back in the input's original shape.
+    """
+    original_shape = topk_ids.shape
+    flat_logical_ids = topk_ids.flatten()
+    # One raw draw per id in [0, 65536), reduced modulo that expert's replica count.
+    raw_draw = torch.randint(
+        0, 65536, flat_logical_ids.shape, dtype=torch.int32, device=topk_ids.device
+    )
+    replica_counts = info.partial_logical_to_all_physical_map_num_valid[
+        flat_logical_ids
+    ]
+    replica_position = raw_draw % replica_counts
+    flat_physical_ids = info.partial_logical_to_all_physical_map[
+        flat_logical_ids, replica_position
+    ]
+    return flat_physical_ids.view(original_shape)

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects transfered between different
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
@@ -22,13 +22,15 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+from sglang.srt.mm_utils import has_valid_data
 # handle serialization of Image for pydantic
 if TYPE_CHECKING:
     from PIL.Image import Image
 else:
     Image = Any
-from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.managers.schedule_batch import BaseFinishReason, flatten_nested_list
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -40,6 +42,10 @@ class SessionParams:
     replace: Optional[bool] = None
+AudioDataItem = Union[str, Dict]
+ImageDataItem = Union[Image, str, Dict]
 @dataclass
 class GenerateReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
@@ -55,10 +61,10 @@ class GenerateReqInput:
     # - List of lists of images (multiple images per request)
     # See also python/sglang/srt/utils.py:load_image for more details.
     image_data: Optional[
-        Union[List[List[Union[Image, str]]], List[Union[Image, str]], Union[Image, str]]
+        Union[List[List[ImageDataItem]], List[ImageDataItem], ImageDataItem]
     ] = None
     # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
-    audio_data: Optional[Union[List[str], str]] = None
+    audio_data: Optional[Union[List[AudioDataItem], AudioDataItem]] = None
     # The sampling_params. See descriptions below.
     sampling_params: Optional[Union[List[Dict], Dict]] = None
     # The request id.
@@ -100,6 +106,9 @@ class GenerateReqInput:
     bootstrap_port: Optional[Union[List[int], int]] = None
     bootstrap_room: Optional[Union[List[int], int]] = None
+    def contains_mm_input(self) -> bool:
+        return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
     def normalize_batch_and_arguments(self):
         """
         Normalize the batch size and arguments for the request.
@@ -398,6 +407,7 @@ class GenerateReqInput:
                 else None
             ),
             return_hidden_states=self.return_hidden_states,
+            # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
             ),
@@ -483,6 +493,9 @@ class EmbeddingReqInput:
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    def contains_mm_input(self) -> bool:
+        return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
     def normalize_batch_and_arguments(self):
         # at least one of text, input_ids, or image should be provided
         if self.text is None and self.input_ids is None and self.image_data is None:
@@ -836,6 +849,8 @@ class ProfileReqInput:
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
     activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
 class ProfileReqType(Enum):

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl