sglang 0.4.5__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +21 -0
- sglang/bench_serving.py +10 -4
- sglang/srt/configs/model_config.py +37 -5
- sglang/srt/constrained/base_grammar_backend.py +26 -5
- sglang/srt/constrained/llguidance_backend.py +1 -0
- sglang/srt/constrained/outlines_backend.py +1 -0
- sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang/srt/disaggregation/base/conn.py +113 -0
- sglang/srt/disaggregation/decode.py +18 -5
- sglang/srt/disaggregation/mini_lb.py +53 -122
- sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang/srt/disaggregation/mooncake/conn.py +615 -0
- sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
- sglang/srt/disaggregation/prefill.py +43 -19
- sglang/srt/disaggregation/utils.py +31 -0
- sglang/srt/entrypoints/EngineBase.py +53 -0
- sglang/srt/entrypoints/engine.py +36 -8
- sglang/srt/entrypoints/http_server.py +37 -8
- sglang/srt/entrypoints/http_server_engine.py +142 -0
- sglang/srt/entrypoints/verl_engine.py +37 -10
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/flashattention_backend.py +330 -200
- sglang/srt/layers/attention/flashinfer_backend.py +13 -7
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/dp_attention.py +2 -4
- sglang/srt/layers/elementwise.py +15 -2
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +38 -21
- sglang/srt/layers/moe/router.py +7 -1
- sglang/srt/layers/moe/topk.py +37 -16
- sglang/srt/layers/quantization/__init__.py +12 -5
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
- sglang/srt/layers/quantization/fp8.py +25 -13
- sglang/srt/layers/quantization/fp8_kernel.py +130 -4
- sglang/srt/layers/quantization/fp8_utils.py +34 -6
- sglang/srt/layers/quantization/kv_cache.py +43 -52
- sglang/srt/layers/quantization/modelopt_quant.py +271 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
- sglang/srt/layers/quantization/w8a8_int8.py +1 -0
- sglang/srt/layers/radix_attention.py +13 -1
- sglang/srt/layers/rotary_embedding.py +12 -1
- sglang/srt/managers/io_struct.py +254 -97
- sglang/srt/managers/mm_utils.py +3 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
- sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
- sglang/srt/managers/schedule_batch.py +62 -21
- sglang/srt/managers/scheduler.py +71 -14
- sglang/srt/managers/tokenizer_manager.py +17 -3
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/memory_pool.py +14 -1
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/forward_batch_info.py +234 -15
- sglang/srt/model_executor/model_runner.py +48 -9
- sglang/srt/model_loader/loader.py +31 -4
- sglang/srt/model_loader/weight_utils.py +4 -2
- sglang/srt/models/baichuan.py +2 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/commandr.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/deepseek.py +1 -0
- sglang/srt/models/deepseek_v2.py +248 -61
- sglang/srt/models/exaone.py +1 -0
- sglang/srt/models/gemma.py +1 -0
- sglang/srt/models/gemma2.py +1 -0
- sglang/srt/models/gemma3_causal.py +1 -0
- sglang/srt/models/gpt2.py +1 -0
- sglang/srt/models/gpt_bigcode.py +1 -0
- sglang/srt/models/granite.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +1 -0
- sglang/srt/models/llama.py +1 -0
- sglang/srt/models/llama4.py +101 -34
- sglang/srt/models/minicpm.py +1 -0
- sglang/srt/models/minicpm3.py +2 -0
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/mllama.py +51 -8
- sglang/srt/models/mllama4.py +102 -29
- sglang/srt/models/olmo.py +1 -0
- sglang/srt/models/olmo2.py +1 -0
- sglang/srt/models/olmoe.py +1 -0
- sglang/srt/models/phi3_small.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +1 -0
- sglang/srt/models/qwen2_5_vl.py +35 -70
- sglang/srt/models/qwen2_moe.py +1 -0
- sglang/srt/models/qwen2_vl.py +27 -25
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/models/xverse.py +1 -0
- sglang/srt/models/xverse_moe.py +1 -0
- sglang/srt/openai_api/adapter.py +4 -1
- sglang/srt/patch_torch.py +11 -0
- sglang/srt/server_args.py +34 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- sglang/srt/speculative/eagle_utils.py +1 -11
- sglang/srt/speculative/eagle_worker.py +6 -2
- sglang/srt/utils.py +120 -9
- sglang/test/attention/test_flashattn_backend.py +259 -221
- sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang/test/attention/test_prefix_chunk_info.py +224 -0
- sglang/test/test_block_fp8.py +57 -0
- sglang/test/test_utils.py +19 -8
- sglang/version.py +1 -1
- {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
- {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +120 -106
- sglang/srt/disaggregation/conn.py +0 -81
- {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
@@ -103,16 +103,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             "input_activations"
         )

-        if not (
-            self.weight_quant.strategy == QuantizationStrategy.TENSOR
-            and self.input_quant.strategy == QuantizationStrategy.TENSOR
-        ):
-            raise ValueError(
-                "For FP8 Fused MoE layers, only per-tensor scales "
-                "for weights and activations are supported. Found "
-                f"{self.weight_quant}, {self.input_quant}"
-            )
-
         self.static_input_scales = not self.input_quant.dynamic

     def create_weights(
@@ -154,27 +144,50 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         set_weight_attrs(w2_weight, extra_weight_attrs)

         # WEIGHT_SCALES
-        #
-
-
-
-
-
+        # per-tensor quantization
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+        elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    1,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+                requires_grad=False,
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+        else:
+            raise ValueError(
+                f"Unsupported weight quantization strategy: {self.weight_quant.strategy}"
+            )

-
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
+        extra_weight_attrs.update({"quant_method": weight_quant_method})
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

         # INPUT_SCALES
         if self.static_input_scales:
+            assert (
+                self.input_quant.strategy == QuantizationStrategy.TENSOR
+            ), "Only per-tensor quantization is supported for static input scales"
             w13_input_scale = torch.nn.Parameter(
                 torch.ones(num_experts, dtype=torch.float32), requires_grad=False
             )
@@ -241,31 +254,37 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             layer.w2_input_scale = torch.nn.Parameter(
                 w2_input_scale, requires_grad=False
             )
-
-
-
-
-
-
-
-
-
-
-
-
-                )
-
-                if _is_cuda:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    )
-                else:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        vllm_ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.local_num_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
                     )
-                start += shard_size

-
+                    if _is_cuda:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    else:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = vllm_ops.scaled_fp8_quant(
+                            dq_weight, max_w13_scales[expert_id]
+                        )
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )

     def apply(
         self,
@@ -285,6 +304,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         activation: str = "silu",
         inplace: bool = True,
         no_combine: bool = False,
+        apply_router_weight_on_input: bool = False,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
@@ -310,10 +330,13 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             inplace=inplace,
             activation=activation,
             use_fp8_w8a8=True,
+            per_channel_quant=self.weight_quant.strategy
+            == QuantizationStrategy.CHANNEL,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
+            apply_router_weight_on_input=apply_router_weight_on_input,
         )

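Illustrative note (not from the package): the per-tensor and per-channel branches added in `create_weights` above differ mainly in the shape of the scale tensors they allocate. A minimal sketch with made-up sizes, reusing the hunk's variable names and assuming a PyTorch build (>= 2.1) that ships `float8_e4m3fn`:

```python
import torch

# Made-up sizes; the real values come from the model config and TP partitioning.
num_experts, intermediate_size_per_partition, hidden_size = 8, 128, 256

# Per-tensor branch: one scale per (expert, shard); they are combined into a
# single scale per expert after weight loading (the requant loop shown above).
w13_scale_tensor = torch.ones(num_experts, 2, dtype=torch.float32)

# Per-channel branch: one scale per output row, with a trailing singleton dim
# so it broadcasts against a (rows, hidden_size) weight matrix.
w13_scale_channel = torch.ones(
    num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
)

w13_weight = torch.randn(
    num_experts, 2 * intermediate_size_per_partition, hidden_size
).to(torch.float8_e4m3fn)

# Dequantize one expert's weight with its per-channel scales (broadcast over columns).
dq = w13_weight[0].to(torch.float32) * w13_scale_channel[0]
print(dq.shape)  # torch.Size([256, 256])
```

The trailing singleton dimension on the per-channel scales is what lets them broadcast row-wise against each expert's weight matrix.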
@@ -71,7 +71,8 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 _is_hip = is_hip()

 if _is_hip:
-    from aiter
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4
     from aiter.ops.shuffle import shuffle_weight

 _is_cuda = is_cuda()
@@ -487,7 +488,7 @@ class Fp8MoEMethod:

         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = (
-                torch.
+                torch.uint32
                 if get_bool_env_var("USE_INT4_WEIGHT")
                 else torch.float8_e4m3fn
             )
@@ -822,12 +823,14 @@ class Fp8MoEMethod:
             # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
             # Weight Permutation
             layer.w13_weight = torch.nn.Parameter(
-                permute_weight(layer.w13_weight.data),
+                # permute_weight(layer.w13_weight.data),
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
             layer.w2_weight = torch.nn.Parameter(
-                permute_weight(layer.w2_weight.data),
+                # permute_weight(layer.w2_weight.data),
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
@@ -867,12 +870,14 @@ class Fp8MoEMethod:

         if get_bool_env_var("CK_MOE"):
             layer.w13_weight = torch.nn.Parameter(
-                permute_weight(layer.w13_weight.data),
+                # permute_weight(layer.w13_weight.data),
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
             layer.w2_weight = torch.nn.Parameter(
-                permute_weight(layer.w2_weight.data),
+                # permute_weight(layer.w2_weight.data),
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
@@ -928,7 +933,7 @@ class Fp8MoEMethod:
         if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
             # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
             assert not no_combine, f"{no_combine=} is not supported."
-            return
+            return ck_moe_2stages_win4(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -936,15 +941,17 @@ class Fp8MoEMethod:
                 topk_ids,
                 layer.w13_weight_scale1,
                 layer.w2_weight_scale1,
-                activation=
+                activation=(
+                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
+                ),
             )
         if _is_hip and get_bool_env_var("CK_MOE"):
-            # TODO(CK_MOE): FP8 or FP8 block_quant only supports 'silu' for the time-being.
-            assert (
-                activation == "silu"
-            ), f"CK_MOE: FP8 and/or FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
+                # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
+                assert (
+                    activation == "silu"
+                ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
                 return asm_moe(
                     x,
                     layer.w13_weight,
@@ -957,7 +964,7 @@ class Fp8MoEMethod:
                     expert_mask=None,
                 )
             else:
-                return
+                return ck_moe_2stages(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
@@ -965,6 +972,11 @@ class Fp8MoEMethod:
                     topk_ids,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
                 )
         else:
             # Expert fusion with FP8 quantization
@@ -16,6 +16,7 @@ import functools
 import json
 import logging
 import os
+from contextlib import contextmanager
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@@ -40,11 +41,13 @@ fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn

 _is_cuda = is_cuda()
 if _is_cuda:
-    import deep_gemm
+    import deep_gemm
     from sgl_kernel import sgl_per_token_group_quant_fp8, sgl_per_token_quant_fp8

     sm_version = get_device_sm()
-    if sm_version
+    if sm_version == 90 and get_bool_env_var(
+        "SGL_ENABLE_JIT_DEEPGEMM", default="false"
+    ):
         _enable_jit_deepgemm = True


@@ -59,7 +62,10 @@ if supports_custom_op():
         Bs: torch.Tensor,
         C: torch.Tensor,
     ) -> None:
-
+        M, K = A.shape
+        N, _ = B.shape
+        with _log_jit_build(M, N, K):
+            deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)

     def deep_gemm_fp8_fp8_bf16_nt_fake(
         A: torch.Tensor,
@@ -708,6 +714,25 @@ def get_w8a8_block_fp8_configs(
     return None


+@contextmanager
+def _log_jit_build(M: int, N: int, K: int):
+    from deep_gemm.jit.runtime import RuntimeCache
+
+    origin_func = RuntimeCache.__getitem__
+
+    def __patched_func(self, *args, **kwargs):
+        ret = origin_func(self, *args, **kwargs)
+        if ret is None:
+            logger.warning(
+                f"DeepGEMM JIT code generation <gemm_fp8_fp8_bf16_nt>: M={M}, N={N}, K={K}. Please wait."
+            )
+        return ret
+
+    RuntimeCache.__getitem__ = __patched_func
+    yield
+    RuntimeCache.__getitem__ = origin_func
+
+
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -782,7 +807,8 @@ def w8a8_block_fp8_matmul(
         if supports_custom_op():
             torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
         else:
-
+            with _log_jit_build(M, N, K):
+                deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
     else:
         kernel = (
             _w8a8_block_fp8_matmul_unrolledx4
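Editor's sketch (not part of the diff): `_log_jit_build` above works by temporarily swapping out `RuntimeCache.__getitem__` so that a cache miss, which is what triggers a DeepGEMM JIT build, emits a warning before the slow compile starts. The same pattern in self-contained form; `SimpleCache` and `log_cache_misses` are illustrative names, not sglang or deep_gemm APIs:

```python
import logging
from contextlib import contextmanager

logger = logging.getLogger(__name__)


class SimpleCache(dict):
    """Stand-in for a build cache whose lookups return None on a miss."""

    def __getitem__(self, key):
        return self.get(key)


@contextmanager
def log_cache_misses(cache_cls, what: str):
    """Temporarily wrap cache_cls.__getitem__ so that misses are logged,
    then restore the original method on exit (same idea as _log_jit_build)."""
    origin = cache_cls.__getitem__

    def patched(self, *args, **kwargs):
        ret = origin(self, *args, **kwargs)
        if ret is None:  # a miss: expensive (re)build work is about to happen
            logger.warning("%s not cached yet, expect a slow first call", what)
        return ret

    cache_cls.__getitem__ = patched
    try:
        yield
    finally:
        cache_cls.__getitem__ = origin


# Usage sketch:
cache = SimpleCache()
with log_cache_misses(SimpleCache, "fp8 GEMM kernel"):
    cache["m128_n256_k512"]  # miss -> warning
```

The restore step here sits in a `finally` block; the hunk above restores the original method unconditionally after `yield`.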
@@ -815,3 +841,103 @@ def w8a8_block_fp8_matmul(
        )

    return C
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage1(
+    x_ptr,
+    x_s_ptr,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    eps,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    _absmax = tl.maximum(tl.max(tl.abs(x)), eps)
+
+    tl.atomic_max(x_s_ptr, _absmax / fp8_max)
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage2(
+    x_ptr,
+    x_s_ptr,
+    x_q_ptr,
+    num_seq,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    fp8_min,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_s = tl.load(x_s_ptr)
+    x_s_inv = 1.0 / x_s
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x_q_ptr += head_id * num_seq * head_size + seq_id * head_size
+
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    x_q = tl.clamp(x * x_s_inv, fp8_min, fp8_max).to(x_q_ptr.dtype.element_ty)
+    tl.store(x_q_ptr + offset, x_q, mask=mask)
+
+
+def per_tensor_quant_mla_fp8(
+    x: torch.Tensor, eps: float = 1e-12, dtype: torch.dtype = torch.float8_e4m3fn
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with tensor-wise quantization
+    and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+
+    finfo = torch.finfo(dtype)
+    fp8_max = finfo.max
+    if _is_hip:
+        dtype = torch.float8_e4m3fnuz
+        fp8_max = 224.0
+
+    x_q = x.new_empty(x.size(), dtype=dtype)
+    x_s = torch.zeros((1,), dtype=torch.float32, device=x.device)
+
+    num_head, num_seq, head_size = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(head_size)
+    grid = (num_seq, num_head)
+
+    _per_tensor_quant_mla_fp8_stage1[grid](
+        x,
+        x_s,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        eps,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+    _per_tensor_quant_mla_fp8_stage2[grid](
+        x,
+        x_s,
+        x_q,
+        num_seq,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        -fp8_max,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s
@@ -168,12 +168,13 @@ def input_to_float8(
     """This function quantizes input values to float8 values with tensor-wise quantization."""
     finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
-    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
     fp8_max = finfo.max
     if _is_hip:
+        dtype = torch.float8_e4m3fnuz
         fp8_max = 224.0
     scale = fp8_max / amax
-    x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
+    x_scl_sat = (x.float() * scale).clamp(min=-fp8_max, max=fp8_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()


@@ -212,7 +213,24 @@ def block_quant_to_tensor_quant(
         for j in range(n_tiles):
             x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]

-    x_q_tensor, scale =
+    x_q_tensor, scale = (
+        sgl_scaled_fp8_quant(x_dq_block)
+        if _is_cuda
+        else input_to_float8(x_dq_block, dtype=x_q_block.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def channel_quant_to_tensor_quant(
+    x_q_channel: torch.Tensor,
+    x_s: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_dq_channel = x_q_channel.to(torch.float32) * x_s
+    x_q_tensor, scale = (
+        sgl_scaled_fp8_quant(x_dq_channel)
+        if _is_cuda
+        else input_to_float8(x_dq_channel, dtype=x_q_channel.dtype)
+    )
     return x_q_tensor, scale

@@ -242,9 +260,19 @@ def apply_fp8_linear(
         if _is_cuda:
             qinput, x_scale = sglang_per_token_quant_fp8(input_2d)
         else:
-
-
-            )
+            # TODO(kkhuang): temporarily enforce per-tensor activation scaling if weight is per-tensor scaling
+            # final solution should be: 1. add support to per-tensor activation scaling.
+            # 2. solve the torch.compile error from weight_scale.numel() == 1 and x_scale.numel() > 1 (below line#308)
+            if _is_hip and weight_scale.numel() == 1:
+                qinput, x_scale = ops.scaled_fp8_quant(
+                    input_2d,
+                    input_scale,
+                    use_per_token_if_dynamic=use_per_token_if_dynamic,
+                )
+            else:
+                qinput, x_scale = per_token_group_quant_fp8(
+                    input_2d, group_size=input_2d.shape[1]
+                )

         if cutlass_fp8_supported:
             try:
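Reference sketch (not part of the diff): the fallback branch above calls `per_token_group_quant_fp8` with the group size set to the full hidden dimension, i.e. dynamic per-token quantization with one scale per row of `input_2d`. A hedged eager-mode equivalent, assuming torch >= 2.1:

```python
import torch


def per_token_quant_fp8_reference(x_2d: torch.Tensor, eps: float = 1e-12):
    """One FP8 scale per row (token), the effect of group_size == hidden size."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = x_2d.abs().amax(dim=-1, keepdim=True).float().clamp(min=eps) / fp8_max
    x_q = (x_2d.float() / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return x_q, scale  # scale has shape [num_tokens, 1]
```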
@@ -8,6 +8,7 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.utils import is_hip

 _is_hip = is_hip()
@@ -17,7 +18,7 @@ logger = logging.getLogger(__name__)

 class BaseKVCacheMethod(QuantizeMethodBase):
     """
-    Quant method that adds `
+    Quant method that adds `k_scale` and `v_scale` attributes to the
     Attention layer to support loading those scaling factors from checkpoints.
     The k/v_scale will be used to:
     - quantize k/v_cache entries before saving them to the cache
@@ -36,8 +37,12 @@ class BaseKVCacheMethod(QuantizeMethodBase):
         # Initialize the KV cache scales to -1.0, which is an invalid value.
         # If the k/v_scale appears in the checkpoint, it will be
         # overwritten when loading weights.
-        layer.k_scale = torch.nn.Parameter(
-
+        layer.k_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )
+        layer.v_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )

     @classmethod
     def is_fp8_fnuz(cls) -> bool:
@@ -47,52 +52,38 @@ class BaseKVCacheMethod(QuantizeMethodBase):
     def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")

-    def process_weights_after_loading(self, layer:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # These are used in the final Attention.forward()
-        layer._k_scale.copy_(k_scale)
-        layer._v_scale.copy_(v_scale)
-        layer._k_scale_float = k_scale
-        layer._v_scale_float = v_scale
-        if k_scale == 1.0 and v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype:
-            logger.warning(
-                "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
-                "may cause accuracy issues. Please make sure k/v_scale "
-                "scaling factors are available in the fp8 checkpoint."
-            )
-
-        del layer.k_scale
-        del layer.v_scale
+    def process_weights_after_loading(self, layer: RadixAttention) -> None:
+        if layer.k_scale > 0.0 and layer.v_scale > 0.0:
+            # We prefer to use separate k_scale and v_scale if present
+            k_scale = layer.k_scale.to("cpu").tolist()
+            v_scale = layer.v_scale.to("cpu").tolist()
+            if _is_hip and self.is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+        elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
+            # If no scales were loaded (both scales are invalid negative
+            # values), use the default value of 1.0
+            k_scale = 1.0
+            v_scale = 1.0
+        else:
+            # If we find a single kv_scale in the checkpoint, we remap
+            # kv_scale to k_scale during weight loading, and duplicate
+            # k_scale to v_scale here
+            assert layer.k_scale > 0.0
+            scale_to_duplicate = max(layer.k_scale, layer.v_scale)
+            k_scale = scale_to_duplicate.to("cpu").tolist()
+            v_scale = scale_to_duplicate.to("cpu").tolist()
+            if _is_hip and self.is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+
+        if not isinstance(k_scale, float) or not isinstance(v_scale, float):
+            raise ValueError(
+                "Only support per-tensor scaling factor " "for fp8 KV cache"
+            )
+
+        # These are used in the final Attention.forward()
+        layer.k_scale.copy_(k_scale)
+        layer.v_scale.copy_(v_scale)
+        layer.k_scale_float = k_scale
+        layer.v_scale_float = v_scale
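The rewritten `process_weights_after_loading` above reduces to three cases for resolving checkpoint scales, with -1.0 meaning "not loaded". A plain-Python sketch of just that decision logic; `resolve_kv_scales` and the `fnuz` flag are illustrative stand-ins for the tensor-valued attributes and the `_is_hip and self.is_fp8_fnuz()` check:

```python
def resolve_kv_scales(k_scale: float, v_scale: float, fnuz: bool = False):
    """Sketch of the branch logic above; -1.0 is the 'not loaded' sentinel."""
    if k_scale > 0.0 and v_scale > 0.0:
        # Both scales came from the checkpoint: use them separately.
        if fnuz:  # float8_e4m3fnuz (ROCm) covers a smaller range, hence the 2x
            k_scale, v_scale = k_scale * 2, v_scale * 2
    elif k_scale < 0.0 and v_scale < 0.0:
        # Neither was loaded: fall back to 1.0.
        k_scale, v_scale = 1.0, 1.0
    else:
        # A single kv_scale was remapped to k_scale at load time: duplicate it.
        k_scale = v_scale = max(k_scale, v_scale)
        if fnuz:
            k_scale, v_scale = k_scale * 2, v_scale * 2
    return k_scale, v_scale


assert resolve_kv_scales(-1.0, -1.0) == (1.0, 1.0)
assert resolve_kv_scales(0.02, -1.0) == (0.02, 0.02)
```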