sglang 0.4.7.post1__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +13 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +12 -16
- sglang/srt/disaggregation/prefill.py +17 -13
- sglang/srt/disaggregation/utils.py +46 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +22 -28
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +67 -29
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +21 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +19 -9
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +207 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +6 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +91 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/io_struct.py +9 -12
- sglang/srt/managers/schedule_batch.py +40 -31
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +147 -62
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +76 -45
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +22 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +108 -26
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +36 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/utils.py +177 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/RECORD +104 -93
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/ep_moe/layer.py

@@ -16,6 +16,7 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     ep_scatter,
     gelu_and_mul_triton_kernel,
     grouped_gemm_triton,
+    moe_ep_deepgemm_preprocess,
     post_reorder_triton_kernel,
     pre_reorder_triton_kernel,
     run_moe_ep_preproess,
@@ -33,10 +34,12 @@ from sglang.srt.layers.quantization.base_config import (
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
 from sglang.srt.layers.quantization.fp8_kernel import (
+    is_fp8_fnuz,
     scaled_fp8_quant,
     sglang_per_token_group_quant_fp8,
     sglang_per_token_quant_fp8,
 )
+from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
 from sglang.srt.managers.expert_location import get_global_expert_location_metadata
 from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -50,6 +53,7 @@ from sglang.srt.utils import (
 )
 
 _is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
 
 if _is_hip:
     from vllm._custom_ops import scaled_fp8_quant
@@ -175,6 +179,7 @@ class EPMoE(torch.nn.Module):
         assert (
             num_fused_shared_experts == 0
         ), "num_fused_shared_experts is not supported in EP"
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.num_experts_per_partition = self.num_experts // self.tp_size
         self.start_expert_id = self.tp_rank * self.num_experts_per_partition
         self.end_expert_id = self.start_expert_id + self.num_experts_per_partition - 1
@@ -224,13 +229,182 @@ class EPMoE(torch.nn.Module):
 
         self.grouped_gemm_runner = None
 
+        self.w13_weight_fp8 = (
+            self.w13_weight,
+            (
+                self.w13_weight_scale_inv
+                if self.use_block_quant
+                else self.w13_weight_scale
+            ),
+        )
+        self.w2_weight_fp8 = (
+            self.w2_weight,
+            self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
+        )
+
     def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
+            return self.forward_deepgemm(hidden_states, router_logits)
+        else:
+            return self.forward_normal(hidden_states, router_logits)
+
+    def forward_deepgemm(
+        self, hidden_states: torch.Tensor, router_logits: torch.Tensor
+    ):
+        assert self.quant_method is not None
+        assert self.activation == "silu"
         hidden_states_shape = hidden_states.shape
         hidden_states_dtype = hidden_states.dtype
         hidden_states_device = hidden_states.device
+        topk_weights, topk_ids = select_experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            use_grouped_topk=self.use_grouped_topk,
+            renormalize=self.renormalize,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            correction_bias=self.correction_bias,
+            custom_routing_function=self.custom_routing_function,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
 
-
+        if not self.use_block_quant:
+            # Convert per-tensor quant to per-block quant by repeating scales for forward_deepgemm
+            scale_block_size = 128
+            w13_weight_scale_n = 2 * (
+                (self.intermediate_size + scale_block_size - 1) // scale_block_size
+            )
+            w13_weight_scale_k = (
+                hidden_states_shape[-1] + scale_block_size - 1
+            ) // scale_block_size
+            w13_weight_scale = (
+                self.w13_weight_scale.unsqueeze(1)
+                .repeat_interleave(w13_weight_scale_n, dim=1)
+                .unsqueeze(2)
+                .repeat_interleave(w13_weight_scale_k, dim=2)
+            )
+            self.w13_weight_fp8 = (
+                self.w13_weight,
+                w13_weight_scale,
+            )
+            w2_weight_scale_n = (
+                hidden_states_shape[-1] + scale_block_size - 1
+            ) // scale_block_size
+            w2_weight_scale_k = (
+                self.intermediate_size + scale_block_size - 1
+            ) // scale_block_size
+            w2_weight_scale = (
+                self.w2_weight_scale.unsqueeze(1)
+                .repeat_interleave(w2_weight_scale_n, dim=1)
+                .unsqueeze(2)
+                .repeat_interleave(w2_weight_scale_k, dim=2)
+            )
+            self.w2_weight_fp8 = (
+                self.w2_weight,
+                w2_weight_scale,
+            )
+
+        # PreReorder
+        m_max, masked_m, expected_m, src2dst, gateup_input, gateup_input_scale = (
+            moe_ep_deepgemm_preprocess(
+                topk_ids,
+                self.num_experts,
+                hidden_states,
+                self.top_k,
+                self.start_expert_id,
+                self.end_expert_id,
+                self.block_shape,
+            )
+        )
+
+        dispose_tensor(hidden_states)
+
+        # GroupGemm-0
+        gateup_input_fp8 = (
+            gateup_input,
+            deep_gemm_wrapper.get_col_major_tma_aligned_tensor(gateup_input_scale),
+        )
+        num_groups, m, k = gateup_input_fp8[0].size()
+        n = self.w13_weight.size(1)
+        gateup_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            gateup_input_fp8, self.w13_weight_fp8, gateup_output, masked_m, expected_m
+        )
+        del gateup_input
+        del gateup_input_fp8
+
+        # Act
+        down_input = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2,
+            ),
+            device=hidden_states_device,
+            dtype=self.fp8_dtype,
+        )
+        scale_block_size = 128
+        down_input_scale = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2 // scale_block_size,
+            ),
+            device=hidden_states_device,
+            dtype=torch.float32,
+        )
+        silu_and_mul_masked_post_quant_fwd(
+            gateup_output,
+            down_input,
+            down_input_scale,
+            scale_block_size,
+            masked_m,
+        )
+        del gateup_output
 
+        # GroupGemm-1
+        n = self.w2_weight.size(1)
+        down_input_fp8 = (
+            down_input,
+            deep_gemm_wrapper.get_col_major_tma_aligned_tensor(down_input_scale),
+        )
+        down_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            down_input_fp8, self.w2_weight_fp8, down_output, masked_m, expected_m
+        )
+        del down_input
+        del down_input_fp8
+
+        # PostReorder
+        output = torch.empty(
+            hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
+        )
+        post_reorder_triton_kernel[(hidden_states_shape[0],)](
+            down_output,
+            output,
+            src2dst,
+            topk_ids,
+            topk_weights,
+            self.start_expert_id,
+            self.end_expert_id,
+            self.top_k,
+            hidden_states_shape[1],
+            m_max * self.start_expert_id,
+            BLOCK_SIZE=512,
+        )
+        return output
+
+    def forward_normal(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        assert self.quant_method is not None
+        hidden_states_shape = hidden_states.shape
+        hidden_states_dtype = hidden_states.dtype
+        hidden_states_device = hidden_states.device
         if self.grouped_gemm_runner is None:
             self.grouped_gemm_runner = GroupedGemmRunner(
                 hidden_states.device,
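The per-tensor to per-block scale expansion in forward_deepgemm above is the least obvious step, so here is a standalone sketch of the same repeat_interleave pattern. The sizes and variable names are invented for illustration; only the 128 block size and the factor of 2 for the fused gate/up projection come from the diff.

```python
import torch

# Toy version of the scale expansion in forward_deepgemm: a per-tensor scale
# of shape (num_experts,) is tiled into a per-block grid of shape
# (num_experts, n_blocks, k_blocks) so the block-wise grouped GEMM can use it.
num_experts, intermediate_size, hidden_size, block = 2, 256, 384, 128

per_tensor_scale = torch.rand(num_experts)                 # one scale per expert
n_blocks = 2 * ((intermediate_size + block - 1) // block)  # w13 fuses gate and up
k_blocks = (hidden_size + block - 1) // block

per_block_scale = (
    per_tensor_scale.unsqueeze(1)
    .repeat_interleave(n_blocks, dim=1)
    .unsqueeze(2)
    .repeat_interleave(k_blocks, dim=2)
)
print(per_block_scale.shape)  # torch.Size([2, 4, 3])
```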
@@ -246,6 +420,7 @@ class EPMoE(torch.nn.Module):
             renormalize=self.renormalize,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             correction_bias=self.correction_bias,
             custom_routing_function=self.custom_routing_function,
             routed_scaling_factor=self.routed_scaling_factor,
@@ -437,6 +612,7 @@ class EPMoE(torch.nn.Module):
             self.end_expert_id,
             self.top_k,
             hidden_states_shape[1],
+            0,
             BLOCK_SIZE=512,
         )
         return output
@@ -843,6 +1019,33 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
                 torch.max(layer.w13_weight_scale, dim=1).values,
                 requires_grad=False,
             )
+            if self.block_quant:
+                # If ROCm, normalize the weights and scales to e4m3fnuz
+                if _is_fp8_fnuz:
+                    # activation_scheme: dynamic
+                    w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=layer.w13_weight,
+                        weight_scale=layer.w13_weight_scale_inv,
+                        input_scale=None,
+                    )
+                    w2_weight, w2_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=layer.w2_weight,
+                        weight_scale=layer.w2_weight_scale_inv,
+                        input_scale=None,
+                    )
+                    # Reset the parameter
+                    layer.w13_weight = torch.nn.Parameter(
+                        w13_weight, requires_grad=False
+                    )
+                    layer.w13_weight_scale_inv = torch.nn.Parameter(
+                        w13_weight_scale, requires_grad=False
+                    )
+                    layer.w13_input_scale = None
+                    layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                    layer.w2_weight_scale_inv = torch.nn.Parameter(
+                        w2_weight_scale, requires_grad=False
+                    )
+                    layer.w2_input_scale = None
             return
 
     def apply(
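For context on the normalize_e4m3fn_to_e4m3fnuz call added above: ROCm uses the e4m3fnuz encoding, whose exponent bias differs from e4m3fn by one, so the same bit pattern decodes to half the value and the weight scale has to be doubled to compensate. The snippet below only illustrates that bias difference; it is not the sglang helper and assumes a recent PyTorch build with float8 dtypes.

```python
import torch

# Illustration only (not normalize_e4m3fn_to_e4m3fnuz): reinterpreting an
# e4m3fn bit pattern as e4m3fnuz halves its value, which is why a converted
# checkpoint can keep its weight bits but must double the associated scale.
x = torch.tensor([1.0], dtype=torch.float8_e4m3fn)
y = x.view(torch.int8).view(torch.float8_e4m3fnuz)  # same bits, new encoding
print(x.float().item(), y.float().item())  # 1.0 0.5 -> compensate with scale * 2
```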
@@ -1265,6 +1468,9 @@ class DeepEPMoE(EPMoE):
 def get_moe_impl_class():
     if global_server_args_dict["enable_deepep_moe"]:
         return DeepEPMoE
+    if global_server_args_dict["enable_flashinfer_moe"]:
+        # Must come before EPMoE because FusedMoE also supports enable_ep_moe
+        return FusedMoE
     if global_server_args_dict["enable_ep_moe"]:
         return EPMoE
     return FusedMoE
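A quick sketch of the selection order get_moe_impl_class now implements, since the ordering is the point of the change: the FlashInfer check must run before the plain EP check so that a configuration with both enable_flashinfer_moe and enable_ep_moe still routes to FusedMoE. The dict-based stand-in below is illustrative only; the real function reads global_server_args_dict and returns classes.

```python
# Stand-in for get_moe_impl_class showing the precedence after this diff.
def pick_moe_impl(args: dict) -> str:
    if args.get("enable_deepep_moe"):
        return "DeepEPMoE"
    if args.get("enable_flashinfer_moe"):
        # checked before enable_ep_moe: FusedMoE also handles EP in this mode
        return "FusedMoE"
    if args.get("enable_ep_moe"):
        return "EPMoE"
    return "FusedMoE"

print(pick_moe_impl({"enable_flashinfer_moe": True, "enable_ep_moe": True}))  # FusedMoE
```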
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json

@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
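The keys in this new tuning file are token counts (M); the other dimensions (E=128, N=384, H100, fp8_w8a8, block_shape=[128, 128]) are encoded in the filename. Presumably the fused Triton MoE path loads the file matching those dimensions and picks the entry whose key is nearest to the current batch size, as the existing config machinery does; the lookup below is a hedged sketch of that convention, not the sglang function.

```python
import json

def pick_moe_config(path: str, num_tokens: int) -> dict:
    # Load the tuning table and return the kernel configuration whose
    # integer key is closest to the current number of tokens.
    with open(path) as f:
        configs = {int(k): v for k, v in json.load(f).items()}
    return configs[min(configs, key=lambda m: abs(m - num_tokens))]

# e.g. 200 tokens would fall back to the "256" entry of the file above
```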
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -25,9 +25,11 @@ from sglang.srt.layers.quantization.int8_kernel import (
     sglang_per_token_group_quant_int8,
 )
 from sglang.srt.utils import (
+    cpu_has_amx_support,
     direct_register_custom_op,
     get_bool_env_var,
     get_device_name,
+    is_cpu,
     is_cuda,
     is_hip,
     log_info_on_rank0,
@@ -36,9 +38,13 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
 if _is_cuda:
     from sgl_kernel import gelu_and_mul, silu_and_mul
+elif _is_cpu and _is_cpu_amx_available:
+    pass
 else:
     from vllm import _custom_ops as vllm_ops
     from vllm._custom_ops import scaled_fp8_quant
sglang/srt/layers/moe/fused_moe_triton/layer.py

@@ -32,6 +32,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _use_aiter:
     from aiter import ActivationType
+    from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 
@@ -204,7 +205,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             topk_weights, dtype=torch.float32
         )  # topk_weights must be FP32 (float32)
 
-        return
+        return fused_moe(
             x,
             layer.w13_weight,
             layer.w2_weight,
@@ -241,7 +242,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
         inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
         return moe_forward_native(
             layer,
@@ -260,7 +265,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("The TPU backend currently does not support MoE.")
 
-    forward_native =
+    forward_native = forward_cpu
 
 
 class FusedMoE(torch.nn.Module):
@@ -310,6 +315,8 @@ class FusedMoE(torch.nn.Module):
         inplace: bool = True,
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
+        enable_flashinfer_moe: Optional[bool] = False,
+        enable_ep_moe: Optional[bool] = False,
     ):
         super().__init__()
 
|
|
320
327
|
self.tp_size = (
|
321
328
|
tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
|
322
329
|
)
|
330
|
+
self.tp_rank = get_tensor_model_parallel_rank()
|
331
|
+
self.num_experts = num_experts
|
332
|
+
self.expert_map = None
|
333
|
+
|
334
|
+
if enable_flashinfer_moe and quant_config is None:
|
335
|
+
logger.warning("Disable flashinfer MoE when quantization config is None.")
|
336
|
+
enable_flashinfer_moe = False
|
337
|
+
enable_ep_moe = False
|
338
|
+
|
339
|
+
self.enable_flashinfer_moe = enable_flashinfer_moe
|
340
|
+
if enable_ep_moe:
|
341
|
+
assert (
|
342
|
+
self.enable_flashinfer_moe
|
343
|
+
), "FusedMoE only supports EP with --enable-flashinfer-moe"
|
344
|
+
self.ep_size = self.tp_size
|
345
|
+
self.ep_rank = self.tp_rank
|
346
|
+
self.tp_size = 1
|
347
|
+
self.tp_rank = 0
|
348
|
+
# Create a tensor of size num_experts filled with -1
|
349
|
+
self.expert_map = torch.full((self.num_experts,), -1, dtype=torch.int32)
|
350
|
+
# Create a expert map for the local experts
|
351
|
+
assert num_experts % self.ep_size == 0
|
352
|
+
self.local_num_experts = num_experts // self.ep_size
|
353
|
+
self.expert_map[
|
354
|
+
self.ep_rank
|
355
|
+
* self.local_num_experts : (self.ep_rank + 1)
|
356
|
+
* self.local_num_experts
|
357
|
+
] = torch.arange(0, self.local_num_experts, dtype=torch.int32, device="cpu")
|
358
|
+
else:
|
359
|
+
self.ep_size = 1
|
360
|
+
self.ep_rank = 0
|
361
|
+
self.local_num_experts = num_experts
|
323
362
|
self.routed_scaling_factor = routed_scaling_factor
|
324
363
|
self.top_k = top_k
|
325
|
-
self.num_experts = num_experts
|
326
364
|
assert intermediate_size % self.tp_size == 0
|
327
365
|
self.intermediate_size_per_partition = intermediate_size // self.tp_size
|
328
366
|
self.reduce_results = reduce_results
|
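The expert_map built above is what later lets weight_loader skip experts that live on other EP ranks. A small worked example of the same construction, with invented sizes:

```python
import torch

# With 8 experts over ep_size=4, rank 1 owns global experts 2-3; they map to
# local ids 0-1 and every other global expert maps to -1 (skipped on load).
num_experts, ep_size, ep_rank = 8, 4, 1
local_num_experts = num_experts // ep_size

expert_map = torch.full((num_experts,), -1, dtype=torch.int32)
expert_map[ep_rank * local_num_experts : (ep_rank + 1) * local_num_experts] = torch.arange(
    local_num_experts, dtype=torch.int32
)
print(expert_map.tolist())  # [-1, -1, 0, 1, -1, -1, -1, -1]
```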
@@ -340,7 +378,6 @@ class FusedMoE(torch.nn.Module):
         self.use_presharded_weights = use_presharded_weights
         self.inplace = inplace
         self.no_combine = no_combine
-        self.local_num_experts = num_experts
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -348,11 +385,13 @@ class FusedMoE(torch.nn.Module):
             )
         else:
             self.quant_method = quant_config.get_quant_method(self, prefix)
+            if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod":
+                self.quant_method.enable_flashinfer_moe = self.enable_flashinfer_moe
         assert self.quant_method is not None
 
         self.quant_method.create_weights(
             layer=self,
-            num_experts=
+            num_experts=self.local_num_experts,
             hidden_size=hidden_size,
             # FIXME: figure out which intermediate_size to use
             intermediate_size=self.intermediate_size_per_partition,
@@ -446,12 +485,15 @@ class FusedMoE(torch.nn.Module):
 
         # Narrow parameter and load.
         # w1, gate_proj: Load into first logical weight of w13.
-        if shard_id == "w1":
-            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
         # w3, up_proj: Load into second logical weight of w13.
+        # trtllm cutlass kernel assumes differently
+        assert shard_id in ("w1", "w3")
+        switch_w13 = getattr(self.quant_method, "load_up_proj_weight_first", False)
+        if (switch_w13 and shard_id == "w1") or (not switch_w13 and shard_id == "w3"):
+            start = shard_size
         else:
-
-
+            start = 0
+        expert_data = expert_data.narrow(shard_dim, start, shard_size)
         expert_data.copy_(loaded_weight)
 
     def _load_w2(
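The rewritten w13 loader above reduces to choosing an offset into the fused gate/up weight and optionally swapping the halves when the quant method sets load_up_proj_weight_first (the comment points at the TensorRT-LLM cutlass kernel layout). A toy version of that logic, with invented shapes:

```python
import torch

# Toy version of the w13 narrow-and-copy logic: "w1" (gate_proj) and "w3"
# (up_proj) fill the two halves of w13; switch_w13 swaps which half gets which.
shard_size, hidden = 4, 3
w13 = torch.zeros(2 * shard_size, hidden)

def load_w13(shard_id: str, weight: torch.Tensor, switch_w13: bool = False) -> None:
    assert shard_id in ("w1", "w3")
    second_half = (switch_w13 and shard_id == "w1") or (not switch_w13 and shard_id == "w3")
    start = shard_size if second_half else 0
    w13.narrow(0, start, shard_size).copy_(weight)

load_w13("w1", torch.ones(shard_size, hidden))          # rows 0-3 by default
load_w13("w3", torch.full((shard_size, hidden), 2.0))   # rows 4-7 by default
```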
@@ -505,6 +547,11 @@ class FusedMoE(torch.nn.Module):
         assert shard_id in ("w1", "w3")
         expert_data.copy_(loaded_weight)
 
+    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+        if self.expert_map is None:
+            return expert_id
+        return self.expert_map[expert_id].item()
+
     def weight_loader(
         self,
         param: torch.nn.Parameter,
@@ -513,6 +560,13 @@ class FusedMoE(torch.nn.Module):
         shard_id: str,
         expert_id: int,
     ) -> None:
+        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+        if expert_id == -1:
+            return
+
+        # TP rank is set to 0 if EP is enabled
+        tp_rank = 0 if self.ep_size > 1 else get_tensor_model_parallel_rank()
+
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
         # against known CompressionFormat enum values that have this quality
@@ -537,7 +591,6 @@ class FusedMoE(torch.nn.Module):
         SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
 
         expert_data = param.data[expert_id]
-        tp_rank = get_tensor_model_parallel_rank()
 
         # is_transposed: if the dim to shard the weight
         # should be flipped. Required by GPTQ, compressed-tensors
@@ -545,7 +598,7 @@ class FusedMoE(torch.nn.Module):
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
         if is_transposed:
-            shard_dim =
+            shard_dim = int(not shard_dim)
 
         # Case input scale: input_scale loading is only supported for fp8
         if "input_scale" in weight_name:
@@ -686,9 +739,19 @@ class FusedMoE(torch.nn.Module):
             activation=self.activation,
             apply_router_weight_on_input=self.apply_router_weight_on_input,
             routed_scaling_factor=self.routed_scaling_factor,
+            **(
+                dict(
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                    ep_rank=self.ep_rank,
+                    ep_size=self.ep_size,
+                )
+                if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
+                else {}
+            ),
         )
 
-        if self.reduce_results and self.tp_size > 1:
+        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states