sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +1 -11
- sglang/bench_serving.py +149 -1
- sglang/lang/chat_template.py +44 -0
- sglang/srt/configs/deepseekvl2.py +3 -0
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +17 -0
- sglang/srt/constrained/xgrammar_backend.py +11 -19
- sglang/srt/conversation.py +30 -3
- sglang/srt/disaggregation/decode.py +4 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +9 -18
- sglang/srt/disaggregation/nixl/conn.py +241 -71
- sglang/srt/disaggregation/utils.py +44 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +14 -2
- sglang/srt/entrypoints/http_server.py +28 -1
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +146 -50
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
- sglang/srt/layers/moe/ep_moe/layer.py +120 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +5 -0
- sglang/srt/layers/quantization/fp8.py +108 -95
- sglang/srt/layers/quantization/fp8_kernel.py +79 -60
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/lora/lora_manager.py +10 -13
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/schedule_batch.py +19 -1
- sglang/srt/managers/schedule_policy.py +11 -5
- sglang/srt/managers/scheduler.py +28 -13
- sglang/srt/managers/tokenizer_manager.py +24 -13
- sglang/srt/managers/tp_worker.py +9 -12
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +2 -2
- sglang/srt/model_executor/model_runner.py +44 -33
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +55 -20
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +53 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +24 -40
- sglang/srt/openai_api/protocol.py +28 -16
- sglang/srt/reasoning_parser.py +2 -2
- sglang/srt/sampling/sampling_batch_info.py +54 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +30 -6
- sglang/srt/utils.py +35 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py

@@ -42,6 +42,8 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
     per_token_group_quant_fp8,
     scaled_fp8_quant,
 )
@@ -64,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )
@@ -71,6 +74,11 @@ from sglang.srt.utils import (
 _is_hip = is_hip()
 _is_cuda = is_cuda()
 
+_is_fp8_fnuz = is_fp8_fnuz()
+
+use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
+use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+
 if _is_hip:
     from aiter import ActivationType, QuantType
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
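
The two ROCm feature flags are now read once at import time and reused as plain booleans at every call site. A minimal sketch of that pattern follows; the `get_bool_env_var` shown here is a simplified stand-in, not the sglang implementation:

    import os

    def get_bool_env_var(name: str, default: str = "false") -> bool:
        # Simplified stand-in: treat "1"/"true"/"yes" (case-insensitive) as True.
        return os.getenv(name, default).strip().lower() in ("1", "true", "yes")

    # Read once at import time; the quantization code below branches on plain bools.
    use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
    use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
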
@@ -97,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
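
`log_info_on_rank0` itself (imported from sglang.srt.utils above) is not part of this diff. A rough sketch of what such a helper typically does, assuming a torch.distributed setup, so treat the details as an assumption:

    import torch.distributed as dist

    def log_info_on_rank0(logger, msg: str) -> None:
        # Emit the message only from the first rank so multi-GPU runs do not
        # repeat the same line once per process.
        if (not dist.is_available()) or (not dist.is_initialized()) or dist.get_rank() == 0:
            logger.info(msg)
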
@@ -306,25 +311,21 @@ class Fp8LinearMethod(LinearMethodBase):
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if _is_hip:
+            if _is_fp8_fnuz:
                 # activation_scheme: dynamic
                 weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.weight,
                     weight_scale=layer.weight_scale_inv,
                     input_scale=None,
                 )
-                layer.weight = torch.nn.Parameter(weight, requires_grad=False)
-                layer.weight_scale_inv = torch.nn.Parameter(
-                    weight_scale, requires_grad=False
-                )
+
                 layer.input_scale = None
             else:
-                layer.weight = torch.nn.Parameter(
-                    layer.weight.data, requires_grad=False
-                )
-                layer.weight_scale_inv = torch.nn.Parameter(
-                    layer.weight_scale_inv.data, requires_grad=False
-                )
+                weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
+            layer.weight = torch.nn.Parameter(weight, requires_grad=False)
+            layer.weight_scale_inv = torch.nn.Parameter(
+                weight_scale, requires_grad=False
+            )
             return
 
         layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
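
The rescaling above is now gated on `_is_fp8_fnuz` (gfx94x hardware) rather than on `_is_hip`. `normalize_e4m3fn_to_e4m3fnuz` is not shown in this diff; a sketch of what such a helper conventionally does, modeled on the vLLM-style utility and therefore an assumption, not sglang's exact code:

    from typing import Optional, Tuple
    import torch

    def normalize_e4m3fn_to_e4m3fnuz(
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        input_scale: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        # Reinterpret the e4m3fn payload as e4m3fnuz bits. Bit pattern 0x80 is
        # negative zero in e4m3fn but NaN in e4m3fnuz, so zero it out first.
        as_int8 = weight.view(torch.int8)
        as_int8[as_int8 == -128] = 0
        weight_fnuz = as_int8.view(torch.float8_e4m3fnuz)
        # For the same bit pattern, an e4m3fnuz value is half the e4m3fn value,
        # so the scales are doubled to keep dequantized values unchanged.
        weight_scale = weight_scale * 2.0
        if input_scale is not None:
            input_scale = input_scale * 2.0
        return weight_fnuz, weight_scale, input_scale
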
@@ -368,7 +369,7 @@ class Fp8LinearMethod(LinearMethodBase):
             weight = layer.weight
             weight_scale = layer.weight_scale
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if _is_hip:
+            if _is_fp8_fnuz:
                 weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight,
                     weight_scale=weight_scale,
@@ -482,11 +483,7 @@ class Fp8MoEMethod:
        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
 
        if self.quant_config.is_checkpoint_fp8_serialized:
-            params_dtype = (
-                torch.uint32
-                if get_bool_env_var("SGLANG_INT4_WEIGHT")
-                else torch.float8_e4m3fn
-            )
+            params_dtype = torch.uint32 if use_hip_int4 else torch.float8_e4m3fn
        tp_size = get_tensor_model_parallel_world_size()
        if self.block_quant:
            block_n, block_k = (
@@ -511,7 +508,7 @@ class Fp8MoEMethod:
        )
 
        # WEIGHTS
-        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+        if _is_hip and use_hip_int4:
            # INT4 MoE weight - INT32 packed
            w13_weight = torch.nn.Parameter(
                torch.empty(
@@ -583,9 +580,7 @@ class Fp8MoEMethod:
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
 
-        if (
-            _is_hip
-        ):  # and get_bool_env_var("SGLANG_AITER_MOE"): TODO: add check back after triton kernel
+        if _is_hip:  # and use_aiter_moe: TODO: add check back after triton kernel
            # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
            w13_weight_scale1 = torch.nn.Parameter(
                torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -612,7 +607,7 @@ class Fp8MoEMethod:
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
-        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+        if _is_hip and use_hip_int4:
            extra_weight_attrs.update(
                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
            )
@@ -644,14 +639,14 @@ class Fp8MoEMethod:
        layer.w2_input_scale = None
 
    def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+        if _is_hip and use_hip_int4:
            self.process_weights_hip_int4(layer)
            return
 
        # Block quant doesn't need to process weights after loading
        if self.block_quant:
            # If ROCm, normalize the weights and scales to e4m3fnuz
-            if _is_hip:
+            if _is_fp8_fnuz:
                # activation_scheme: dynamic
                w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                    weight=layer.w13_weight,
@@ -675,20 +670,19 @@ class Fp8MoEMethod:
                )
                layer.w2_input_scale = None
 
-
-
-
-
-
-
-
-
+            if _is_hip and use_aiter_moe:
+                # Pre-shuffle weights
+                layer.w13_weight.data = shuffle_weight(
+                    layer.w13_weight.contiguous(), (16, 16)
+                )
+                layer.w2_weight.data = shuffle_weight(
+                    layer.w2_weight.contiguous(), (16, 16)
+                )
            return
 
        # If checkpoint is fp16 or bfloat16, quantize in place.
        if not self.quant_config.is_checkpoint_fp8_serialized:
-            # If ROCm,
-            fp8_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+            # If ROCm, fp8_dtype will be float8_e4m3fnuz (MI300x HW)
            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
 
|
@@ -742,7 +736,7 @@ class Fp8MoEMethod:
|
|
742
736
|
)
|
743
737
|
|
744
738
|
# If ROCm, normalize the weights and scales to e4m3fnuz
|
745
|
-
if
|
739
|
+
if _is_fp8_fnuz:
|
746
740
|
# Normalize the weights and scales
|
747
741
|
w13_weight, w13_weight_scale, w13_input_scale = (
|
748
742
|
normalize_e4m3fn_to_e4m3fnuz(
|
@@ -798,7 +792,7 @@ class Fp8MoEMethod:
        return
 
    def process_weights_hip_int4(self, layer: Module):
-        # TODO: and get_bool_env_var("SGLANG_AITER_MOE"): add after triton kernel added
+        # TODO: and use_aiter_moe: add after triton kernel added
        # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
        # Weight Permutation
        layer.w13_weight = torch.nn.Parameter(
@@ -845,7 +839,7 @@ class Fp8MoEMethod:
            padding_size,  # Avoid circular import
        )
 
-        if get_bool_env_var("SGLANG_AITER_MOE"):
+        if use_aiter_moe:
            layer.w13_weight = torch.nn.Parameter(
                shuffle_weight(layer.w13_weight.data, (16, 16)),
                requires_grad=False,
@@ -856,7 +850,7 @@ class Fp8MoEMethod:
                requires_grad=False,
            )
            torch.cuda.empty_cache()
-            # ROCm (
+            # ROCm (use_aiter_moe): using column-wise scaling
            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
        elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -908,59 +902,16 @@ class Fp8MoEMethod:
        )
 
        if _is_hip:
-            if get_bool_env_var("SGLANG_INT4_WEIGHT"):
-                # TODO: add triton kernel and add check get_bool_env_var("SGLANG_AITER_MOE")
-                assert not no_combine, f"{no_combine=} is not supported."
-                return ck_moe_2stages(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights,
-                    topk_ids,
-                    QuantType.per_Token,
-                    layer.w13_weight_scale1,
-                    layer.w2_weight_scale1,
-                    activation=(
-                        ActivationType.Silu
-                        if activation == "silu"
-                        else ActivationType.Gelu
-                    ),
-                )
-
-            if get_bool_env_var("SGLANG_AITER_MOE"):
-                assert not no_combine, f"{no_combine=} is not supported."
-                if self.block_quant:
-                    # TODO(SGLANG_AITER_MOE): FP8 block_quant only supports 'silu' for the time-being.
-                    assert (
-                        activation == "silu"
-                    ), f"SGLANG_AITER_MOE: FP8 bloack_quant {activation=} will be supported later, unset SGLANG_AITER_MOE"
-                return asm_moe(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights,
-                    topk_ids,
-                    layer.w13_weight_scale_inv,
-                    layer.w2_weight_scale_inv,
-                    block_shape=tuple(self.quant_config.weight_block_size),
-                    expert_mask=None,
-                )
-            else:
-                return ck_moe_2stages(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights,
-                    topk_ids,
-                    QuantType.per_Token,
-                    layer.w13_weight_scale1,
-                    layer.w2_weight_scale1,
-                    activation=(
-                        ActivationType.Silu
-                        if activation == "silu"
-                        else ActivationType.Gelu
-                    ),
-                )
+            ret = self.maybe_apply_hip_fused_experts(
+                layer,
+                x,
+                topk_weights,
+                topk_ids,
+                activation,
+                no_combine,
+            )
+            if ret is not None:
+                return ret
 
        # Expert fusion with FP8 quantization
        return fused_experts(
|
@@ -987,6 +938,68 @@ class Fp8MoEMethod:
|
|
987
938
|
no_combine=no_combine,
|
988
939
|
)
|
989
940
|
|
941
|
+
def maybe_apply_hip_fused_experts(
|
942
|
+
self,
|
943
|
+
layer: torch.nn.Module,
|
944
|
+
x: torch.Tensor,
|
945
|
+
topk_weights: torch.Tensor,
|
946
|
+
topk_ids: torch.Tensor,
|
947
|
+
activation: str = "silu",
|
948
|
+
no_combine: bool = False,
|
949
|
+
) -> Optional[torch.Tensor]:
|
950
|
+
if use_hip_int4:
|
951
|
+
# TODO: add triton kernel and add check use_aiter_moe
|
952
|
+
assert not no_combine, f"{no_combine=} is not supported."
|
953
|
+
return ck_moe_2stages(
|
954
|
+
x,
|
955
|
+
layer.w13_weight,
|
956
|
+
layer.w2_weight,
|
957
|
+
topk_weights,
|
958
|
+
topk_ids,
|
959
|
+
QuantType.per_Token,
|
960
|
+
layer.w13_weight_scale1,
|
961
|
+
layer.w2_weight_scale1,
|
962
|
+
activation=(
|
963
|
+
ActivationType.Silu if activation == "silu" else ActivationType.Gelu
|
964
|
+
),
|
965
|
+
)
|
966
|
+
|
967
|
+
if use_aiter_moe:
|
968
|
+
assert not no_combine, f"{no_combine=} is not supported."
|
969
|
+
if self.block_quant:
|
970
|
+
# TODO(use_aiter_moe): FP8 block_quant only supports 'silu' for the time-being.
|
971
|
+
assert (
|
972
|
+
activation == "silu"
|
973
|
+
), f"use_aiter_moe: FP8 bloack_quant {activation=} will be supported later, unset use_aiter_moe"
|
974
|
+
return asm_moe(
|
975
|
+
x,
|
976
|
+
layer.w13_weight,
|
977
|
+
layer.w2_weight,
|
978
|
+
topk_weights,
|
979
|
+
topk_ids,
|
980
|
+
layer.w13_weight_scale_inv,
|
981
|
+
layer.w2_weight_scale_inv,
|
982
|
+
block_shape=tuple(self.quant_config.weight_block_size),
|
983
|
+
expert_mask=None,
|
984
|
+
)
|
985
|
+
else:
|
986
|
+
return ck_moe_2stages(
|
987
|
+
x,
|
988
|
+
layer.w13_weight,
|
989
|
+
layer.w2_weight,
|
990
|
+
topk_weights,
|
991
|
+
topk_ids,
|
992
|
+
QuantType.per_Token,
|
993
|
+
layer.w13_weight_scale1,
|
994
|
+
layer.w2_weight_scale1,
|
995
|
+
activation=(
|
996
|
+
ActivationType.Silu
|
997
|
+
if activation == "silu"
|
998
|
+
else ActivationType.Gelu
|
999
|
+
),
|
1000
|
+
)
|
1001
|
+
return None
|
1002
|
+
|
990
1003
|
|
991
1004
|
class Fp8KVCacheMethod(BaseKVCacheMethod):
|
992
1005
|
"""
|
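
The ROCm fast paths are still opt-in through the same environment variables; only the plumbing changed, since they are now read once into `use_hip_int4` / `use_aiter_moe` when the module is imported. A hedged example of enabling the aiter MoE path (the flag names come from this diff; the engine construction and model id below are illustrative assumptions):

    import os

    # Set before sglang's quantization modules are imported, because the flags
    # are read once at module import time rather than per call.
    os.environ["SGLANG_AITER_MOE"] = "1"      # route MoE through asm_moe / ck_moe_2stages
    # os.environ["SGLANG_INT4_WEIGHT"] = "1"  # optional: INT4-weight, FP8-compute MoE path

    import sglang as sgl  # noqa: E402

    engine = sgl.Engine(model_path="deepseek-ai/DeepSeek-V3")  # hypothetical example model
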
sglang/srt/layers/quantization/fp8_kernel.py

@@ -16,6 +16,7 @@ import functools
 import json
 import logging
 import os
+from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
@@ -29,17 +30,12 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
-_fp8_type = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
-if _is_hip:
-    fp8_max = 224.0
-else:
-    fp8_max = torch.finfo(_fp8_type).max
-fp8_min = -fp8_max
 
 if _is_cuda:
     from sgl_kernel import (
@@ -54,6 +50,24 @@ if _is_cuda:
 
 logger = logging.getLogger(__name__)
 
+
+@lru_cache()
+def is_fp8_fnuz() -> bool:
+    if _is_hip:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
+    return False
+
+
+if is_fp8_fnuz():
+    fp8_dtype = torch.float8_e4m3fnuz
+    fp8_max = 224.0
+else:
+    fp8_dtype = torch.float8_e4m3fn
+    fp8_max = torch.finfo(fp8_dtype).max
+fp8_min = -fp8_max
+
+
 if supports_custom_op():
 
     def deep_gemm_fp8_fp8_bf16_nt(
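
With `fp8_dtype`, `fp8_min`/`fp8_max`, and `is_fp8_fnuz()` now exported from fp8_kernel.py, callers no longer choose the FP8 dtype per call site. A minimal sketch of the intended usage pattern; the `quantize_per_tensor` helper below is illustrative, not part of sglang:

    from typing import Tuple
    import torch
    from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype, fp8_max, fp8_min, is_fp8_fnuz

    def quantize_per_tensor(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Scale so the largest magnitude maps to fp8_max, clamp to the
        # representable range, and cast to the platform fp8 dtype
        # (float8_e4m3fnuz on gfx94x/MI300, float8_e4m3fn elsewhere).
        scale = x.abs().amax().clamp(min=1e-12) / fp8_max
        x_q = (x / scale).clamp(fp8_min, fp8_max).to(fp8_dtype)
        return x_q, scale

    print("fnuz platform:", is_fp8_fnuz())
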
@@ -198,7 +212,7 @@ def per_token_group_quant_fp8(
    ), "the last dimension of `x` cannot be divisible by `group_size`"
    assert x.is_contiguous(), "`x` is not contiguous"
 
-    x_q = torch.empty_like(x, device=x.device, dtype=_fp8_type)
+    x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
    M = x.numel() // group_size
    N = group_size
    if column_major_scales:
@@ -272,7 +286,7 @@ def sglang_per_token_group_quant_fp8(
    ), "the last dimension of `x` cannot be divisible by `group_size`"
    assert x.is_contiguous(), "`x` is not contiguous"
 
-    x_q = torch.empty_like(x, device=x.device, dtype=_fp8_type)
+    x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
    if column_major_scales:
        if scale_tma_aligned:
            # aligned to 4 * sizeof(float)
@@ -294,15 +308,15 @@ def sglang_per_token_group_quant_fp8(
            device=x.device,
            dtype=torch.float32,
        )
-
-
+    if x.shape[0] > 0:
+        sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, eps, fp8_min, fp8_max)
 
    return x_q, x_s
 
 
 def sglang_per_token_quant_fp8(
    x: torch.Tensor,
-    dtype: torch.dtype = _fp8_type,
+    dtype: torch.dtype = fp8_dtype,
 ):
    assert x.is_contiguous(), "`x` is not contiguous"
 
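
The new `if x.shape[0] > 0` guard skips the Triton kernel launch entirely for empty inputs, and the pre-allocated (empty) `x_q` / `x_s` are returned untouched. A hedged usage sketch, assuming a CUDA/ROCm build where this function is importable and that `group_size` is its second parameter as in the sibling `per_token_group_quant_fp8`:

    import torch
    from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8

    # Zero-row input, e.g. a batch with no tokens for this rank: with the guard,
    # no kernel is launched and empty tensors come back with the expected shapes.
    x = torch.empty((0, 256), dtype=torch.bfloat16, device="cuda")
    x_q, x_s = sglang_per_token_group_quant_fp8(x, group_size=128)
    print(x_q.shape, x_s.shape)  # torch.Size([0, 256]) and an empty scale tensor
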
@@ -384,7 +398,7 @@ def static_quant_fp8(
    assert x.is_contiguous(), "`x` is not contiguous"
    assert x_s.numel() == 1, "only supports per-tensor scale"
 
-    x_q = torch.empty_like(x, device=x.device, dtype=_fp8_type)
+    x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
    M = x.numel() // x.shape[-1]
    N = x.shape[-1]
    if repeat_scale:
@@ -685,9 +699,9 @@ def get_w8a8_block_fp8_configs(
    )
    if os.path.exists(config_file_path):
        with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
            )
            # If a configuration has been found, return it
            return {int(key): val for key, val in json.load(f).items()}
@@ -704,6 +718,28 @@ def get_w8a8_block_fp8_configs(
    return None
 
 
+def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+    return _w8a8_block_fp8_matmul
+
+
+if _is_hip:
+
+    def use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+        # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
+        # Empirical testing shows the sweet spot lies when it's less than the # of
+        # compute units available on the device.
+        num_workgroups = triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(
+            N, META["BLOCK_SIZE_N"]
+        )
+        num_workgroups <= get_device_core_count()
+
+    def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+        if use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+            return _w8a8_block_fp8_matmul_unrolledx4
+        else:
+            return _w8a8_block_fp8_matmul
+
+
 def w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
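
The selector keeps the call site backend-agnostic: `w8a8_block_fp8_matmul` asks for a kernel given the problem shape and the chosen config, and only the ROCm build ever returns the unrolled variant. A minimal sketch of the same dispatch pattern in isolation; the kernel functions and the core count here are placeholders, not the real Triton kernels:

    def _kernel_default(*args, **kwargs):
        return "default"

    def _kernel_unrolledx4(*args, **kwargs):
        return "unrolledx4"

    def cdiv(a: int, b: int) -> int:
        return (a + b - 1) // b

    DEVICE_CORE_COUNT = 304  # e.g. CU count of an MI300X; assumption for this sketch

    def select_kernel(M: int, N: int, meta: dict):
        # Prefer the manually unrolled kernel only when the launch grid is smaller
        # than the number of compute units, mirroring the heuristic above.
        num_workgroups = cdiv(M, meta["BLOCK_SIZE_M"]) * cdiv(N, meta["BLOCK_SIZE_N"])
        return _kernel_unrolledx4 if num_workgroups <= DEVICE_CORE_COUNT else _kernel_default

    kernel = select_kernel(256, 4096, {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128})
    print(kernel())  # "unrolledx4": 4 * 32 = 128 workgroups, below the CU count
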
@@ -744,35 +780,6 @@ def w8a8_block_fp8_matmul(
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)
 
-    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
-    if configs:
-        # If an optimal configuration map has been found, look up the
-        # optimal config
-        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-    else:
-        # Default config
-        # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
-        config = {
-            "BLOCK_SIZE_M": 64,
-            "BLOCK_SIZE_N": block_size[0],
-            "BLOCK_SIZE_K": block_size[1],
-            "GROUP_SIZE_M": 32,
-            "num_warps": 4,
-            "num_stages": 3,
-        }
-
-    def grid(META):
-        return (
-            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-        )
-
-    # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
-    # Empirical testing shows the sweet spot lies when it's less than the # of
-    # compute units available on the device.
-    num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
-        N, config["BLOCK_SIZE_N"]
-    )
-
    # deepgemm only support bf16
    if C.dtype == torch.bfloat16 and _ENABLE_JIT_DEEPGEMM:
        if supports_custom_op():
@@ -780,11 +787,30 @@ def w8a8_block_fp8_matmul(
        else:
            deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
    else:
-
-
-
-
-
+        configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Default config
+            # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+            config = {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": block_size[0],
+                "BLOCK_SIZE_K": block_size[1],
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            }
+
+        def grid(META):
+            return (
+                triton.cdiv(M, META["BLOCK_SIZE_M"])
+                * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+            )
+
+        kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
 
        kernel[grid](
            A,
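
Tuned configs are keyed by the M (row count) they were benchmarked at, and the lookup picks the entry whose key is numerically closest to the runtime M. A small worked example of that selection rule:

    # Keys mimic the benchmarked batch sizes stored in a block-FP8 config JSON.
    configs = {
        1: {"BLOCK_SIZE_M": 16, "num_stages": 2},
        64: {"BLOCK_SIZE_M": 64, "num_stages": 3},
        1024: {"BLOCK_SIZE_M": 128, "num_stages": 4},
    }

    M = 200  # runtime number of rows
    config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
    print(config)  # {'BLOCK_SIZE_M': 64, 'num_stages': 3} -- 64 is closest to 200
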
@@ -879,7 +905,7 @@ def per_tensor_quant_mla_fp8(
        and x_s_out.device == x.device
    )
 
-    x_q = x.new_empty(x.size(), dtype=_fp8_type)
+    x_q = x.new_empty(x.size(), dtype=fp8_dtype)
 
    num_head, num_seq, head_size = x.shape
    BLOCK_SIZE = triton.next_power_of_2(head_size)
@@ -961,11 +987,11 @@ def _per_token_group_quant_mla_deep_gemm_masked_fp8(
    tl.store(y_s_ptr + gid * y_s_stride_g, y_s)
 
 
-def per_tensor_quant_mla_deep_gemm_masked_fp8(
+def per_token_group_quant_mla_deep_gemm_masked_fp8(
    x: torch.Tensor,
    group_size: int = 128,
    eps: float = 1e-12,
-    dtype: torch.dtype = torch.float8_e4m3fn,
+    dtype: torch.dtype = fp8_dtype,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    This function quantizes input values to float8 values with per-token-group-quantization
@@ -973,12 +999,6 @@ def per_tensor_quant_mla_deep_gemm_masked_fp8(
    """
    assert x.dim() == 3, "`x` is not a 3d-tensor"
 
-    finfo = torch.finfo(dtype)
-    fp8_max = finfo.max
-    if _is_hip:
-        dtype = torch.float8_e4m3fnuz
-        fp8_max = 224.0
-
    b, m, k = x.shape
    aligned_m = (m + 255) // 256 * 256  # 256 is the max block_m of the gemm kernel
    num_tiles_k = k // group_size
@@ -1043,10 +1063,9 @@ def scaled_fp8_quant(
    """
    assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
    shape = input.shape
-    out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
-    output = torch.empty(shape, device=input.device, dtype=out_dtype)
+    output = torch.empty(shape, device=input.device, dtype=fp8_dtype)
 
    if scale is None:
        # Dynamic scaling
|