sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py}

@@ -5,33 +5,22 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple
 
-import torch
 from tqdm.contrib.concurrent import thread_map
 
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    ENABLE_JIT_DEEPGEMM,
+)
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var,
+from sglang.srt.utils import get_bool_env_var, get_int_env_var
 
 logger = logging.getLogger(__name__)
-_ENABLE_JIT_DEEPGEMM = False
 
-
-    import deep_gemm
+if ENABLE_JIT_DEEPGEMM and not DEEPGEMM_BLACKWELL:
     from deep_gemm import get_num_sms
-    from deep_gemm.jit
+    from deep_gemm.jit import build
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner
-
-    sm_version = get_device_sm()
-    if sm_version == 90:
-        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
-            _ENABLE_JIT_DEEPGEMM = True
-except ImportError:
-    logger.warning("Failed to import deepgemm, disable _ENABLE_JIT_DEEPGEMM.")
-
-
-def get_enable_jit_deepgemm():
-    return _ENABLE_JIT_DEEPGEMM
 
 
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
@@ -52,8 +41,10 @@ os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
 # NVRTC may have performance loss with some cases.
 # And NVCC JIT speed is also 9x faster in the ref commit
 _USE_NVRTC_DEFAULT = "0"
-if
+if ENABLE_JIT_DEEPGEMM:
     try:
+        from deep_gemm.jit.compiler import get_nvcc_compiler
+
         get_nvcc_compiler()
     except:
         logger.warning(
@@ -114,11 +105,12 @@ class DeepGemmKernelHelper:
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
 
 
+# TODO improve naming
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "
+            "It may takes a long time (typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
             "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
@@ -127,6 +119,7 @@ def _compile_warning_1():
         )
 
 
+# TODO improve naming
 def _compile_warning_2():
     logger.warning(
         "Entering DeepGEMM JIT Single Kernel Compile session. "
@@ -148,32 +141,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +176,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_gemm_nt_f8f8bf16_one(
@@ -228,30 +212,26 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
 
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
+
+# TODO further refactor warmup-related
 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
         name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
@@ -284,7 +264,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     num_groups: int,
     m_list: Optional[List[int]] = None,
 ) -> None:
-
     global _INITIALIZATION_DICT
     global _BUILTIN_M_LIST
 
@@ -318,56 +297,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
 
 
-def grouped_gemm_nt_f8f8bf16_masked(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    masked_m: torch.Tensor,
-    expected_m: int,
-):
-    num_groups, _, k = lhs[0].shape
-    _, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(expected_m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
-            lhs, rhs, out, masked_m, expected_m
-        )
-
-
-def grouped_gemm_nt_f8f8bf16_contig(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    m_indices: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    num_groups, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs, rhs, out, m_indices)
-
-
-def gemm_nt_f8f8bf16(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GEMM_NT_F8F8BF16
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, 1)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, out)
-
-
 @contextmanager
 def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     if _IN_PRECOMPILE_STAGE:
@@ -382,7 +311,8 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
         ret = origin_func(self, *args, **kwargs)
         if ret is None:
             kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-
+            if not DEEPGEMM_BLACKWELL:
+                _compile_warning_2()
             logger.warning(
                 f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
             )
@@ -391,3 +321,15 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    # not supported yet
+    if not DEEPGEMM_BLACKWELL:
+        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+    with _log_jit_build(m, n, k, kernel_type):
+        yield
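With these hunks, compile_utils keeps only the pre-compilation bookkeeping and hands it to callers through a single deep_gemm_execution_hook context manager; the public GEMM entry points move to the new entrypoint module shown further below. A minimal, self-contained analog of that hook pattern (illustrative names only, not sglang code):

from contextlib import contextmanager

_compiled_families = set()

def _maybe_compile_all(n: int, k: int, num_groups: int) -> None:
    # Analog of _maybe_compile_deep_gemm_one_type_all: the first request for a
    # (N, K, num_groups) family triggers compilation of every M variant at once.
    key = (n, k, num_groups)
    if key not in _compiled_families:
        _compiled_families.add(key)
        print(f"pre-compiling kernel family N={n} K={k} groups={num_groups}")

@contextmanager
def execution_hook(m: int, n: int, k: int, num_groups: int):
    # Analog of deep_gemm_execution_hook: maybe-compile first (skipped on
    # Blackwell in the real code), then log any JIT build around the launch.
    _maybe_compile_all(n, k, num_groups)
    print(f"launching GEMM M={m} N={n} K={k}")
    yield

with execution_hook(64, 4096, 7168, num_groups=1):
    pass  # the actual deep_gemm kernel call would go here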
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py (new file)

@@ -0,0 +1,32 @@
+import logging
+
+from sglang.srt.utils import get_bool_env_var, get_device_sm
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm
+    except ImportError:
+        logger.warning("Failed to import deep_gemm, disable ENABLE_JIT_DEEPGEMM.")
+        return False
+
+    return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+try:
+    from deep_gemm import fp8_gemm_nt
+
+    # They have not given a name to this breaking change
+    DEEPGEMM_BLACKWELL = True
+except ImportError:
+    DEEPGEMM_BLACKWELL = False
+
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py (new file)

@@ -0,0 +1,110 @@
+import logging
+from contextlib import contextmanager
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+    if DEEPGEMM_BLACKWELL:
+        from deep_gemm import fp8_gemm_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import (
+            fp8_m_grouped_gemm_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+        from deep_gemm import (
+            m_grouped_fp8_gemm_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+    else:
+        from deep_gemm import gemm_fp8_fp8_bf16_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import get_col_major_tma_aligned_tensor
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+    recipe=None,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        _grouped_gemm_nt_f8f8bf16_masked_raw(
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
+            **({"recipe": recipe} if DEEPGEMM_BLACKWELL else {})
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _grouped_gemm_nt_f8f8bf16_contig_raw(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _gemm_nt_f8f8bf16_raw(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
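For orientation, a hedged sketch of how a caller might use the new wrapper. The direct entrypoint import, the placeholder tensors, and the scale shapes are assumptions not shown in this diff (any re-export in deep_gemm_wrapper/__init__.py and DeepGEMM's scale-alignment requirements are glossed over here), so treat this as an illustration of the call signatures rather than production code.

import torch

from sglang.srt.layers.quantization.deep_gemm_wrapper import entrypoint as deep_gemm_wrapper

m, k, n = 128, 7168, 4096
# (tensor, scales) pairs: assumed per-token x 128-channel scales for lhs and
# 128x128 block scales for rhs; values are placeholders, not a real workload.
lhs = (
    torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn),
    torch.ones(m, k // 128, device="cuda", dtype=torch.float32),
)
rhs = (
    torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn),
    torch.ones(n // 128, k // 128, device="cuda", dtype=torch.float32),
)
out = torch.empty(m, n, device="cuda", dtype=torch.bfloat16)

# Optionally cap the SM count for the duration of the call, e.g. to leave SMs
# free for an overlapping communication kernel.
with deep_gemm_wrapper.configure_deep_gemm_num_sms(112):
    deep_gemm_wrapper.gemm_nt_f8f8bf16(lhs, rhs, out)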
@@ -49,10 +49,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
-    apply_w8a8_block_fp8_linear,
     cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
     input_to_float8,
-    is_sm100_supported,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -63,6 +62,7 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     requantize_with_max_scale,
 )
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
@@ -77,8 +77,8 @@ _is_cuda = is_cuda()

 _is_fp8_fnuz = is_fp8_fnuz()

-
-
+_use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _is_hip:
     from aiter import ActivationType, QuantType
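The two new module-level flags are plain environment-variable reads, so they are fixed when the quantization module is imported. A hedged sketch of one way a ROCm deployment could opt in; only the variable names SGLANG_INT4_WEIGHT and SGLANG_USE_AITER come from the diff, the rest is an assumption about how they are set:

    # Sketch only: setting the variables from Python before importing sglang is just
    # one way to make sure they are visible when fp8.py is imported.
    import os

    os.environ["SGLANG_INT4_WEIGHT"] = "1"  # enable the INT4-packed MoE weight path
    os.environ["SGLANG_USE_AITER"] = "1"    # enable aiter kernels; only honored on HIP (_is_hip)

    import sglang  # _use_hip_int4 / _use_aiter are evaluated during this import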
@@ -209,6 +209,8 @@ class Fp8LinearMethod(LinearMethodBase):
             # Marlin doesn't support block-wise fp8
             self.use_marlin = False

+        self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -417,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase):
             )

         if self.block_quant:
-            return
+            return self.w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
                 block_size=self.quant_config.weight_block_size,
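Fp8LinearMethod now resolves the block-FP8 GEMM implementation once in its constructor via dispatch_w8a8_block_fp8_linear() and calls the returned function in apply. A generic sketch of that dispatch-once pattern; the candidate backends and the selection rule below are invented placeholders, not sglang's actual logic:

    # Placeholder illustration of the "dispatch once, call many times" pattern.
    from typing import Callable

    import torch


    def _triton_w8a8_block_fp8(input: torch.Tensor, weight: torch.Tensor, **kw) -> torch.Tensor:
        ...  # placeholder backend


    def _cutlass_w8a8_block_fp8(input: torch.Tensor, weight: torch.Tensor, **kw) -> torch.Tensor:
        ...  # placeholder backend


    def dispatch_w8a8_block_fp8_linear_sketch() -> Callable[..., torch.Tensor]:
        # Pick a backend once at layer construction time instead of re-checking
        # hardware capabilities on every forward pass.
        if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9:
            return _cutlass_w8a8_block_fp8
        return _triton_w8a8_block_fp8


    class LinearMethodSketch:
        def __init__(self):
            self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear_sketch()

        def apply(self, layer, x):
            return self.w8a8_block_fp8_linear(input=x, weight=layer.weight)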
@@ -485,7 +487,7 @@ class Fp8MoEMethod:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

         if self.quant_config.is_checkpoint_fp8_serialized:
-            params_dtype = torch.uint32 if
+            params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
         tp_size = get_tensor_model_parallel_world_size()
         if self.block_quant:
             block_n, block_k = (
@@ -510,7 +512,7 @@ class Fp8MoEMethod:
         )

         # WEIGHTS
-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             # INT4 MoE weight - INT32 packed
             w13_weight = torch.nn.Parameter(
                 torch.empty(
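The _use_hip_int4 branch stores MoE weights as INT4 values packed into 32-bit words (params_dtype = torch.uint32). As a rough illustration of that storage idea only, eight 4-bit values can be carried in one int32; the packing order and layout below are invented for the example and are not aiter's actual scheme:

    # Illustration: pack eight unsigned 4-bit values per 32-bit word, so a [..., K]
    # int4 tensor can be carried in a [..., K // 8] int32 parameter.
    import torch


    def pack_int4(nibbles: torch.Tensor) -> torch.Tensor:
        # nibbles: integer tensor with values in [0, 15], last dim divisible by 8
        assert nibbles.shape[-1] % 8 == 0
        n = nibbles.to(torch.int64).reshape(*nibbles.shape[:-1], -1, 8)
        shifts = torch.arange(0, 32, 4, dtype=torch.int64)          # 0, 4, ..., 28
        words = (n << shifts).sum(dim=-1)                           # exact in int64
        words = torch.where(words >= 2**31, words - 2**32, words)   # reinterpret as signed
        return words.to(torch.int32)                                # [..., K // 8]


    def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
        p = (packed.to(torch.int64) & 0xFFFFFFFF).unsqueeze(-1)
        shifts = torch.arange(0, 32, 4, dtype=torch.int64)
        nibbles = (p >> shifts) & 0xF
        return nibbles.reshape(*packed.shape[:-1], -1)               # [..., K]


    w = torch.randint(0, 16, (4, 16), dtype=torch.int64)
    assert torch.equal(unpack_int4(pack_int4(w)), w)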
@@ -571,7 +573,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
             assert self.quant_config.activation_scheme == "dynamic"
             if (
-                get_bool_env_var("
+                get_bool_env_var("SGLANG_CUTLASS_MOE")
                 and self.cutlass_fp8_supported
                 and is_sm100_supported()
             ):
@@ -639,7 +641,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
             layer.register_parameter("w2_weight_scale", w2_weight_scale)

-            if _is_hip:  #
+            if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
                 # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
                 w13_weight_scale1 = torch.nn.Parameter(
                     torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -666,7 +668,7 @@ class Fp8MoEMethod:
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             extra_weight_attrs.update(
                 {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
             )
@@ -698,7 +700,7 @@ class Fp8MoEMethod:
             layer.w2_input_scale = None

     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             self.process_weights_hip_int4(layer)
             return

@@ -729,7 +731,7 @@ class Fp8MoEMethod:
                 )
                 layer.w2_input_scale = None

-            if
+            if _use_aiter:
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
@@ -851,7 +853,7 @@ class Fp8MoEMethod:
             return

     def process_weights_hip_int4(self, layer: Module):
-        # TODO:
+        # TODO: _use_aiter: add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
@@ -898,7 +900,7 @@ class Fp8MoEMethod:
             padding_size,  # Avoid circular import
         )

-        if
+        if _use_aiter:
            layer.w13_weight = torch.nn.Parameter(
                shuffle_weight(layer.w13_weight.data, (16, 16)),
                requires_grad=False,
@@ -909,7 +911,7 @@ class Fp8MoEMethod:
                requires_grad=False,
            )
            torch.cuda.empty_cache()
-        # ROCm (
+           # ROCm (_use_aiter): using column-wise scaling
            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
        elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -935,6 +937,7 @@ class Fp8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -955,6 +958,7 @@ class Fp8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -973,14 +977,14 @@ class Fp8MoEMethod:
             return ret

         if (
-            get_bool_env_var("
+            get_bool_env_var("SGLANG_CUTLASS_MOE")
             and self.cutlass_fp8_supported
             and self.block_quant
             and is_sm100_supported()
         ):
-            from sglang.srt.layers.moe.cutlass_moe import
+            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8

-            return
+            return cutlass_fused_experts_fp8(
                 x,
                 layer.w13_weight.transpose(1, 2),
                 layer.w2_weight.transpose(1, 2),
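The CUTLASS fused-experts path is opt-in: it is only taken when SGLANG_CUTLASS_MOE is set and the deployment uses block-quantized FP8 on SM100-class hardware with CUTLASS FP8 support. A small sketch of the opt-in; only the environment variable name is taken from the diff, and the commented checks simply restate the condition shown above:

    # Sketch: request the cutlass_fused_experts_fp8 path before the server starts.
    import os

    os.environ["SGLANG_CUTLASS_MOE"] = "1"

    # At runtime the method still verifies (see the hunk above):
    #   get_bool_env_var("SGLANG_CUTLASS_MOE")
    #   and self.cutlass_fp8_supported   # CUTLASS FP8 kernels are usable
    #   and self.block_quant             # weights are block-quantized FP8
    #   and is_sm100_supported()         # SM100-class GPU
    # Otherwise it falls back to the default fused-experts implementation.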
@@ -1026,6 +1030,7 @@ class Fp8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     def maybe_apply_hip_fused_experts(
@@ -1037,8 +1042,8 @@ class Fp8MoEMethod:
         activation: str = "silu",
         no_combine: bool = False,
     ) -> Optional[torch.Tensor]:
-        if
-            # TODO: add triton kernel and add check
+        if _use_hip_int4:
+            # TODO: add triton kernel and add check _use_aiter
             assert not no_combine, f"{no_combine=} is not supported."
             return ck_moe_2stages(
                 x,
@@ -1054,13 +1059,13 @@ class Fp8MoEMethod:
             ),
         )

-        if
+        if _use_aiter:
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
-                # TODO(
+                # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being.
                 assert (
                     activation == "silu"
-                ), f"
+                ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
             return asm_moe(
                 x,
                 layer.w13_weight,