sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/blockwise_int8.py

@@ -3,7 +3,7 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional

  import torch
  from torch.nn import Module
@@ -22,6 +22,7 @@ from sglang.srt.layers.quantization.utils import is_layer_skipped
  from sglang.srt.utils import set_weight_attrs

  if TYPE_CHECKING:
+ from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
  from sglang.srt.layers.moe.topk import TopKOutput

  ACTIVATION_SCHEMES = ["static", "dynamic"]
@@ -348,12 +349,7 @@ class BlockInt8MoEMethod(FusedMoEMethodBase):
  layer: torch.nn.Module,
  x: torch.Tensor,
  topk_output: TopKOutput,
- *,
- activation: str = "silu",
- apply_router_weight_on_input: bool = False,
- inplace: bool = True,
- no_combine: bool = False,
- routed_scaling_factor: Optional[float] = None,
+ moe_runner_config: MoeRunnerConfig,
  ) -> torch.Tensor:
  from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts

@@ -363,15 +359,11 @@ class BlockInt8MoEMethod(FusedMoEMethodBase):
  layer.w13_weight,
  layer.w2_weight,
  topk_output=topk_output,
- inplace=inplace,
- activation=activation,
- apply_router_weight_on_input=apply_router_weight_on_input,
+ moe_runner_config=moe_runner_config,
  use_int8_w8a8=True,
  w1_scale=(layer.w13_weight_scale_inv),
  w2_scale=(layer.w2_weight_scale_inv),
  a1_scale=layer.w13_input_scale,
  a2_scale=layer.w2_input_scale,
  block_shape=self.quant_config.weight_block_size,
- no_combine=no_combine,
- routed_scaling_factor=routed_scaling_factor,
  )
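The signature change above is the pattern repeated throughout this release: the per-call keyword arguments (activation, apply_router_weight_on_input, inplace, no_combine, routed_scaling_factor) are bundled into a single moe_runner_config argument. A minimal sketch of what such a bundle and the resulting call could look like is below; the field names simply mirror the arguments removed in this diff, and the MoeRunnerConfig shown here is an illustrative stand-in, not the class defined in sglang/srt/layers/moe/moe_runner.

from dataclasses import dataclass
from typing import Optional

@dataclass
class MoeRunnerConfig:
    # Illustrative stand-in; fields mirror the keyword arguments that the
    # 0.5.1 apply() methods no longer accept individually.
    activation: str = "silu"
    apply_router_weight_on_input: bool = False
    inplace: bool = True
    no_combine: bool = False
    routed_scaling_factor: Optional[float] = None

# Callers now build one config object and pass it through unchanged, e.g.:
# method.apply(layer, x, topk_output, moe_runner_config=MoeRunnerConfig(inplace=False))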
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py

@@ -16,19 +16,33 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_qu
  from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
  from sglang.srt.layers.quantization.utils import (
  all_close_1d,
- cpu_has_amx_support,
  per_tensor_dequantize,
  replace_parameter,
  )
- from sglang.srt.utils import is_cpu, is_cuda, is_hip, is_npu, set_weight_attrs
+ from sglang.srt.utils import (
+ get_bool_env_var,
+ is_cpu,
+ is_cuda,
+ is_hip,
+ is_npu,
+ set_weight_attrs,
+ )

  if TYPE_CHECKING:
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+ from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
  from sglang.srt.layers.moe.topk import TopKOutput
  from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
  CompressedTensorsConfig,
  )

+ _is_hip = is_hip()
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+ if _use_aiter:
+ from aiter.ops.shuffle import shuffle_weight
+
+ from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1

  try:
  import vllm
@@ -265,37 +279,66 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
  max_w13_scales, requires_grad=False
  )

+ if _use_aiter:
+ with torch.no_grad():
+ # Pre-shuffle weights
+ layer.w13_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w13_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+ layer.w2_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w2_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+
  def apply(
  self,
  layer: torch.nn.Module,
  x: torch.Tensor,
  topk_output: TopKOutput,
- *,
- activation: str = "silu",
- apply_router_weight_on_input: bool = False,
- inplace: bool = True,
- no_combine: bool = False,
- routed_scaling_factor: Optional[float] = None,
+ moe_runner_config: MoeRunnerConfig,
  ) -> torch.Tensor:
  from sglang.srt.layers.moe.fused_moe_triton import fused_experts

- return fused_experts(
- x,
- layer.w13_weight,
- layer.w2_weight,
- topk_output=topk_output,
- inplace=inplace,
- activation=activation,
- use_fp8_w8a8=True,
- per_channel_quant=self.weight_quant.strategy
- == QuantizationStrategy.CHANNEL,
- w1_scale=layer.w13_weight_scale,
- w2_scale=layer.w2_weight_scale,
- a1_scale=layer.w13_input_scale,
- a2_scale=layer.w2_input_scale,
- apply_router_weight_on_input=apply_router_weight_on_input,
- routed_scaling_factor=routed_scaling_factor,
- )
+ if (
+ _use_aiter
+ and self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+ and moe_runner_config.apply_router_weight_on_input
+ ):
+ topk_weights, topk_ids, _ = topk_output
+ return rocm_fused_experts_tkw1(
+ hidden_states=x,
+ w1=layer.w13_weight,
+ w2=layer.w2_weight,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ activation=moe_runner_config.activation,
+ apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+ use_fp8_w8a8=True,
+ per_channel_quant=self.weight_quant.strategy
+ == QuantizationStrategy.CHANNEL,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )
+ else:
+ return fused_experts(
+ x,
+ layer.w13_weight,
+ layer.w2_weight,
+ topk_output=topk_output,
+ moe_runner_config=moe_runner_config,
+ use_fp8_w8a8=True,
+ per_channel_quant=self.weight_quant.strategy
+ == QuantizationStrategy.CHANNEL,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )


  class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
@@ -602,12 +645,12 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
  layer: torch.nn.Module,
  x: torch.Tensor,
  topk_output: TopKOutput,
- *,
- activation: str = "silu",
- **kwargs,
+ moe_runner_config: MoeRunnerConfig,
  ) -> torch.Tensor:

- assert activation == "silu", "Only SiLU activation is supported."
+ assert (
+ moe_runner_config.activation == "silu"
+ ), "Only SiLU activation is supported."

  topk_weights, topk_ids, router_logits = topk_output
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py

@@ -7,7 +7,8 @@ logger = logging.getLogger(__name__)

  def _compute_enable_deep_gemm():
  sm_version = get_device_sm()
- if sm_version < 90:
+ # TODO fix blackwell fp8
+ if sm_version != 90:
  return False

  try:
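The gate above now requires exactly SM90 rather than SM90 or newer, so Blackwell (SM100) falls back to the non-DeepGEMM path until its FP8 path is fixed. A rough equivalent written directly against torch.cuda, assuming get_device_sm() returns major * 10 + minor for the current device, is:

import torch

def deep_gemm_enabled_sketch() -> bool:
    # Assumes get_device_sm() == major * 10 + minor of the current CUDA device;
    # without a CUDA device DeepGEMM stays disabled.
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    # 0.5.1 enables DeepGEMM only on Hopper (SM90); SM80 and SM100 are excluded.
    return major * 10 + minor == 90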
sglang/srt/layers/quantization/fp8.py

@@ -49,6 +49,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
  )
  from sglang.srt.layers.quantization.fp8_utils import (
  apply_fp8_linear,
+ can_auto_enable_marlin_fp8,
  cutlass_fp8_supported,
  dispatch_w8a8_block_fp8_linear,
  input_to_float8,
@@ -79,6 +80,7 @@ from sglang.srt.utils import (
  )

  if TYPE_CHECKING:
+ from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
  from sglang.srt.layers.moe.topk import TopKOutput
  from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config

@@ -208,17 +210,13 @@ class Fp8LinearMethod(LinearMethodBase):

  # For GPUs that lack FP8 hardware support, we can leverage the Marlin
  # kernel for fast weight-only FP8 quantization
- self.use_marlin = (
- get_bool_env_var("SGLANG_FORCE_FP8_MARLIN") and MARLIN_FP8_AVAILABLE
- )
- # Disable marlin for ROCm
- if _is_hip:
- self.use_marlin = False
+ self.use_marlin = False
+ if _is_cuda and MARLIN_FP8_AVAILABLE:
+ force_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
+ auto_enable = can_auto_enable_marlin_fp8()
+ self.use_marlin = force_marlin or auto_enable

  self.block_quant = self.quant_config.weight_block_size is not None
- if self.block_quant:
- # Marlin doesn't support block-wise fp8
- self.use_marlin = False

  self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
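With the hunk above, Marlin FP8 is no longer gated solely by the SGLANG_FORCE_FP8_MARLIN environment variable: on CUDA builds where the kernel is available it can also be switched on automatically via can_auto_enable_marlin_fp8(). A condensed sketch of the resulting decision is below; is_cuda, marlin_available, and auto_enable_ok stand in for _is_cuda, MARLIN_FP8_AVAILABLE, and can_auto_enable_marlin_fp8(), and the env-var parsing only approximates sglang's get_bool_env_var.

import os

def use_marlin_fp8_sketch(is_cuda: bool, marlin_available: bool, auto_enable_ok: bool) -> bool:
    # Only CUDA builds with the Marlin FP8 kernel are eligible at all.
    if not (is_cuda and marlin_available):
        return False
    # Either the explicit force flag or the automatic heuristic enables it;
    # block-quantized checkpoints no longer disable it at this point.
    force_marlin = os.environ.get("SGLANG_FORCE_FP8_MARLIN", "false").strip().lower() in ("true", "1")
    return force_marlin or auto_enable_ok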
@@ -331,7 +329,6 @@ class Fp8LinearMethod(LinearMethodBase):
  layer.register_parameter("input_scale", None)

  def process_weights_after_loading(self, layer: Module) -> None:
- # Block quant doesn't need to process weights after loading
  if self.block_quant:
  # If ROCm, normalize the weights and scales to e4m3fnuz
  if _is_fp8_fnuz:
@@ -341,7 +338,6 @@ class Fp8LinearMethod(LinearMethodBase):
  weight_scale=layer.weight_scale_inv,
  input_scale=None,
  )
-
  layer.input_scale = None
  elif _is_cpu:
  assert (
@@ -351,90 +347,94 @@ class Fp8LinearMethod(LinearMethodBase):
  return
  else:
  weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
- layer.weight = torch.nn.Parameter(weight, requires_grad=False)
- layer.weight_scale_inv = torch.nn.Parameter(
- weight_scale, requires_grad=False
- )
- return
+ layer.weight = Parameter(weight, requires_grad=False)
+ layer.weight_scale_inv = Parameter(weight_scale, requires_grad=False)
+ else:
+ layer.weight = Parameter(layer.weight.data, requires_grad=False)

- layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
+ # If checkpoint not serialized fp8, quantize the weights.
+ if not self.quant_config.is_checkpoint_fp8_serialized:
+ if self.cutlass_fp8_supported or self.use_marlin:
+ # apply per-channel quantization default as
+ # cutlass sgl-kernel and marlin only support per-channel scale
+ qweight, weight_scale = per_token_group_quant_fp8(
+ layer.weight, layer.weight.shape[-1]
+ )
+ weight_scale = weight_scale.t().contiguous()
+ else:
+ # per-tensor quantization
+ qweight, weight_scale = input_to_float8(layer.weight)
+
+ # Update the layer with the new values.
+ layer.weight = Parameter(qweight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+ layer.input_scale = None

- # If checkpoint not serialized fp8, quantize the weights.
- if not self.quant_config.is_checkpoint_fp8_serialized:
- if self.cutlass_fp8_supported or self.use_marlin:
- # apply per-channel quantization default, as cutlass sgl-kernel and marlin only support per-channel scale
- qweight, weight_scale = per_token_group_quant_fp8(
- layer.weight, layer.weight.shape[-1]
- )
- weight_scale = weight_scale.t().contiguous()
+ # If checkpoint is fp8, handle that there are N scales for N
+ # shards in a fused module
  else:
- # per-tensor quantization
- qweight, weight_scale = input_to_float8(layer.weight)
-
- # Update the layer with the new values.
- layer.weight = Parameter(qweight.t(), requires_grad=False)
- layer.weight_scale = Parameter(weight_scale, requires_grad=False)
- layer.input_scale = None
-
- # If checkpoint is fp8, handle that there are N scales for N
- # shards in a fused module
- else:
- layer.weight_scale = torch.nn.Parameter(
- layer.weight_scale.data, requires_grad=False
- )
- if (
- hasattr(self.quant_config, "activation_scheme")
- and self.quant_config.activation_scheme == "static"
- ) or (
- hasattr(self.quant_config, "linear_activation_scheme")
- and self.quant_config.linear_activation_scheme == "static"
- ):
- layer.input_scale = torch.nn.Parameter(
- layer.input_scale.data, requires_grad=False
+ layer.weight_scale = Parameter(
+ layer.weight_scale.data, requires_grad=False
  )
+ if (
+ hasattr(self.quant_config, "activation_scheme")
+ and self.quant_config.activation_scheme == "static"
+ ) or (
+ hasattr(self.quant_config, "linear_activation_scheme")
+ and self.quant_config.linear_activation_scheme == "static"
+ ):
+ layer.input_scale = Parameter(
+ layer.input_scale.data, requires_grad=False
+ )

- # cutlass sgl-kernel and marlin only support per-channel scale
- if self.cutlass_fp8_supported or self.use_marlin:
- weight = layer.weight
- weight_scale = convert_to_channelwise(
- layer.weight_scale, layer.logical_widths
- )
- else:
- # Dequant -> Quant with max scale so we can run per tensor.
- weight = layer.weight
- weight_scale = layer.weight_scale
- # If ROCm, normalize the weights and scales to e4m3fnuz
- if _is_fp8_fnuz:
- weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ # cutlass sgl-kernel and marlin only support per-channel scale
+ if self.cutlass_fp8_supported or self.use_marlin:
+ weight = layer.weight
+ weight_scale = convert_to_channelwise(
+ layer.weight_scale, layer.logical_widths
+ )
+ else:
+ # Dequant -> Quant with max scale so we can run per tensor.
+ weight = layer.weight
+ weight_scale = layer.weight_scale
+ # If ROCm, normalize the weights and scales to e4m3fnuz
+ if _is_fp8_fnuz:
+ weight, weight_scale, input_scale = (
+ normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight,
+ weight_scale=weight_scale,
+ input_scale=layer.input_scale,
+ )
+ )
+ if input_scale is not None:
+ layer.input_scale = Parameter(
+ input_scale, requires_grad=False
+ )
+
+ weight_scale, weight = requantize_with_max_scale(
  weight=weight,
  weight_scale=weight_scale,
- input_scale=layer.input_scale,
+ logical_widths=layer.logical_widths,
  )
- if input_scale is not None:
- layer.input_scale = Parameter(input_scale, requires_grad=False)
-
- weight_scale, weight = requantize_with_max_scale(
- weight=weight,
- weight_scale=weight_scale,
- logical_widths=layer.logical_widths,
- )

- # Update layer with new values.
- layer.weight = Parameter(weight.t(), requires_grad=False)
- layer.weight_scale = Parameter(weight_scale, requires_grad=False)
- if (
- hasattr(self.quant_config, "activation_scheme")
- and self.quant_config.activation_scheme == "static"
- ) or (
- hasattr(self.quant_config, "linear_activation_scheme")
- and self.quant_config.linear_activation_scheme == "static"
- ):
- layer.input_scale = Parameter(
- layer.input_scale.max(), requires_grad=False
- )
+ # Update layer with new values.
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+ if (
+ hasattr(self.quant_config, "activation_scheme")
+ and self.quant_config.activation_scheme == "static"
+ ) or (
+ hasattr(self.quant_config, "linear_activation_scheme")
+ and self.quant_config.linear_activation_scheme == "static"
+ ):
+ layer.input_scale = Parameter(
+ layer.input_scale.max(), requires_grad=False
+ )

  if self.use_marlin:
- prepare_fp8_layer_for_marlin(layer)
+ if self.block_quant:
+ layer.weight_block_size = self.quant_config.weight_block_size
+ prepare_fp8_layer_for_marlin(layer, not self.block_quant)
  # Activations not quantized for marlin.
  del layer.input_scale
@@ -444,7 +444,6 @@ class Fp8LinearMethod(LinearMethodBase):
  x: torch.Tensor,
  bias: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
-
  if self.use_marlin:
  return apply_fp8_marlin_linear(
  input=x,
@@ -515,6 +514,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  self.quant_config = quant_config
  self.block_quant = self.quant_config.weight_block_size is not None
  self.cutlass_fp8_supported = cutlass_fp8_supported()
+ self.use_cutlass_fused_experts_fp8 = (
+ get_bool_env_var("SGLANG_CUTLASS_MOE")
+ and self.cutlass_fp8_supported
+ and self.block_quant
+ and (is_sm100_supported() or is_sm90_supported())
+ )

  def create_weights(
  self,
@@ -961,6 +966,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  requires_grad=False,
  )
  torch.cuda.empty_cache()
+
  # ROCm (_use_aiter): using column-wise scaling
  layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
  layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
@@ -982,12 +988,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  layer: torch.nn.Module,
  x: torch.Tensor,
  topk_output: TopKOutput,
- *,
- activation: str = "silu",
- apply_router_weight_on_input: bool = False,
- inplace: bool = True,
- no_combine: bool = False,
- routed_scaling_factor: Optional[float] = None,
+ moe_runner_config: MoeRunnerConfig,
  ) -> torch.Tensor:
  from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts

@@ -996,7 +997,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):

  topk_weights, topk_ids, _ = topk_output
  x, topk_weights = apply_topk_weights_cpu(
- apply_router_weight_on_input, topk_weights, x
+ moe_runner_config.apply_router_weight_on_input, topk_weights, x
  )

  return torch.ops.sgl_kernel.fused_experts_cpu(
@@ -1021,18 +1022,13 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  layer,
  x,
  topk_output,
- activation,
- no_combine,
+ moe_runner_config.activation,
+ moe_runner_config.no_combine,
  )
  if ret is not None:
  return ret

- if (
- get_bool_env_var("SGLANG_CUTLASS_MOE")
- and self.cutlass_fp8_supported
- and self.block_quant
- and (is_sm100_supported() or is_sm90_supported())
- ):
+ if self.use_cutlass_fused_experts_fp8:
  from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8

  topk_weights, topk_ids, _ = topk_output
@@ -1059,9 +1055,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  self.problem_sizes2,
  use_fp8_blockscale=True,
  )
- # TODO: Fuse into select_experts
- if routed_scaling_factor is not None:
- output *= routed_scaling_factor
+ # Scale by routed_scaling_factor is fused into select_experts.
  return output
  # Expert fusion with FP8 quantization
  return fused_experts(
@@ -1069,9 +1063,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  layer.w13_weight,
  layer.w2_weight,
  topk_output=topk_output,
- inplace=inplace and not no_combine,
- activation=activation,
- apply_router_weight_on_input=apply_router_weight_on_input,
+ moe_runner_config=moe_runner_config,
  use_fp8_w8a8=True,
  w1_scale=(
  layer.w13_weight_scale_inv
@@ -1084,30 +1076,44 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  a1_scale=layer.w13_input_scale,
  a2_scale=layer.w2_input_scale,
  block_shape=self.quant_config.weight_block_size,
- no_combine=no_combine,
- routed_scaling_factor=routed_scaling_factor,
  )

  def apply_with_router_logits(
  self,
  layer: torch.nn.Module,
  x: torch.Tensor,
- router_logits: torch.Tensor,
- *,
- activation: str = "silu",
- routed_scaling_factor: Optional[float] = None,
+ topk_output: TopKOutput,
+ moe_runner_config: MoeRunnerConfig,
  ) -> torch.Tensor:
+ activation = moe_runner_config.activation
+ routed_scaling_factor = moe_runner_config.routed_scaling_factor
+
+ from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
+
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ assert TopKOutputChecker.format_is_bypassed(topk_output)
+ router_logits = topk_output.router_logits
+ topk_config = topk_output.topk_config
  assert (
  activation == "silu"
  ), "Only silu is supported for flashinfer blockscale fp8 moe"
  a_q, a_sf = per_token_group_quant_fp8(x, self.quant_config.weight_block_size[1])
  # NOTE: scales of hidden states have to be transposed!
  a_sf_t = a_sf.t().contiguous()
- from flashinfer.fused_moe import trtllm_fp8_block_scale_moe

+ assert (
+ topk_config.num_expert_group is not None
+ and topk_config.topk_group is not None
+ ), "Current trtllm_fp8_block_scale_moe kernel does not support these two arguments as None"
+
+ if topk_config.correction_bias is None:
+ correction_bias = topk_config.correction_bias.to(x.dtype)
+ else:
+ correction_bias = None
  return trtllm_fp8_block_scale_moe(
  routing_logits=router_logits.to(torch.float32),
- routing_bias=layer.correction_bias.to(x.dtype),
+ routing_bias=correction_bias,
  hidden_states=a_q,
  hidden_states_scale=a_sf_t,
  gemm1_weights=layer.w13_weight,
@@ -1115,15 +1121,17 @@ class Fp8MoEMethod(FusedMoEMethodBase):
  gemm2_weights=layer.w2_weight,
  gemm2_weights_scale=layer.w2_weight_scale_inv,
  num_experts=layer.num_experts,
- top_k=layer.top_k,
- n_group=layer.num_expert_group,
- topk_group=layer.topk_group,
+ top_k=topk_config.top_k,
+ n_group=topk_config.num_expert_group,
+ topk_group=topk_config.topk_group,
  intermediate_size=layer.w2_weight.shape[2],
  local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
  local_num_experts=layer.num_local_experts,
- routed_scaling_factor=routed_scaling_factor,
+ routed_scaling_factor=(
+ routed_scaling_factor if routed_scaling_factor is not None else 1.0
+ ),
  tile_tokens_dim=get_tile_tokens_dim(
- x.shape[0], layer.top_k, layer.num_experts
+ x.shape[0], topk_config.top_k, layer.num_experts
  ),
  routing_method_type=2, # DeepSeek-styled routing method
  use_shuffled_weight=False,