sglang 0.4.4.post4__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +21 -0
- sglang/bench_serving.py +10 -4
- sglang/lang/chat_template.py +24 -0
- sglang/srt/configs/model_config.py +40 -4
- sglang/srt/constrained/base_grammar_backend.py +26 -5
- sglang/srt/constrained/llguidance_backend.py +1 -0
- sglang/srt/constrained/outlines_backend.py +1 -0
- sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
- sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang/srt/conversation.py +29 -4
- sglang/srt/disaggregation/base/__init__.py +8 -0
- sglang/srt/disaggregation/base/conn.py +113 -0
- sglang/srt/disaggregation/decode.py +18 -5
- sglang/srt/disaggregation/mini_lb.py +53 -122
- sglang/srt/disaggregation/mooncake/__init__.py +6 -0
- sglang/srt/disaggregation/mooncake/conn.py +615 -0
- sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
- sglang/srt/disaggregation/prefill.py +43 -19
- sglang/srt/disaggregation/utils.py +31 -0
- sglang/srt/entrypoints/EngineBase.py +53 -0
- sglang/srt/entrypoints/engine.py +36 -8
- sglang/srt/entrypoints/http_server.py +37 -8
- sglang/srt/entrypoints/http_server_engine.py +142 -0
- sglang/srt/entrypoints/verl_engine.py +37 -10
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/flashattention_backend.py +609 -202
- sglang/srt/layers/attention/flashinfer_backend.py +13 -7
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/dp_attention.py +2 -4
- sglang/srt/layers/elementwise.py +15 -2
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +51 -24
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/router.py +7 -1
- sglang/srt/layers/moe/topk.py +37 -16
- sglang/srt/layers/quantization/__init__.py +13 -5
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
- sglang/srt/layers/quantization/fp8.py +28 -14
- sglang/srt/layers/quantization/fp8_kernel.py +130 -4
- sglang/srt/layers/quantization/fp8_utils.py +34 -6
- sglang/srt/layers/quantization/kv_cache.py +43 -52
- sglang/srt/layers/quantization/modelopt_quant.py +271 -4
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +14 -0
- sglang/srt/layers/rotary_embedding.py +75 -1
- sglang/srt/managers/io_struct.py +254 -97
- sglang/srt/managers/mm_utils.py +3 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
- sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
- sglang/srt/managers/multimodal_processors/mllama4.py +146 -0
- sglang/srt/managers/schedule_batch.py +62 -21
- sglang/srt/managers/scheduler.py +71 -14
- sglang/srt/managers/tokenizer_manager.py +17 -3
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/memory_pool.py +14 -1
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +7 -4
- sglang/srt/model_executor/forward_batch_info.py +234 -15
- sglang/srt/model_executor/model_runner.py +49 -9
- sglang/srt/model_loader/loader.py +31 -4
- sglang/srt/model_loader/weight_utils.py +4 -2
- sglang/srt/models/baichuan.py +2 -0
- sglang/srt/models/chatglm.py +1 -0
- sglang/srt/models/commandr.py +1 -0
- sglang/srt/models/dbrx.py +1 -0
- sglang/srt/models/deepseek.py +1 -0
- sglang/srt/models/deepseek_v2.py +248 -61
- sglang/srt/models/exaone.py +1 -0
- sglang/srt/models/gemma.py +1 -0
- sglang/srt/models/gemma2.py +1 -0
- sglang/srt/models/gemma3_causal.py +1 -0
- sglang/srt/models/gpt2.py +1 -0
- sglang/srt/models/gpt_bigcode.py +1 -0
- sglang/srt/models/granite.py +1 -0
- sglang/srt/models/grok.py +1 -0
- sglang/srt/models/internlm2.py +1 -0
- sglang/srt/models/llama.py +13 -4
- sglang/srt/models/llama4.py +487 -0
- sglang/srt/models/minicpm.py +1 -0
- sglang/srt/models/minicpm3.py +2 -0
- sglang/srt/models/mixtral.py +1 -0
- sglang/srt/models/mixtral_quant.py +1 -0
- sglang/srt/models/mllama.py +51 -8
- sglang/srt/models/mllama4.py +227 -0
- sglang/srt/models/olmo.py +1 -0
- sglang/srt/models/olmo2.py +1 -0
- sglang/srt/models/olmoe.py +1 -0
- sglang/srt/models/phi3_small.py +1 -0
- sglang/srt/models/qwen.py +1 -0
- sglang/srt/models/qwen2.py +1 -0
- sglang/srt/models/qwen2_5_vl.py +35 -70
- sglang/srt/models/qwen2_moe.py +1 -0
- sglang/srt/models/qwen2_vl.py +27 -25
- sglang/srt/models/stablelm.py +1 -0
- sglang/srt/models/xverse.py +1 -0
- sglang/srt/models/xverse_moe.py +1 -0
- sglang/srt/openai_api/adapter.py +4 -1
- sglang/srt/patch_torch.py +11 -0
- sglang/srt/server_args.py +34 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
- sglang/srt/speculative/eagle_utils.py +1 -11
- sglang/srt/speculative/eagle_worker.py +6 -2
- sglang/srt/utils.py +120 -9
- sglang/test/attention/test_flashattn_backend.py +259 -221
- sglang/test/attention/test_flashattn_mla_backend.py +285 -0
- sglang/test/attention/test_prefix_chunk_info.py +224 -0
- sglang/test/test_block_fp8.py +57 -0
- sglang/test/test_utils.py +19 -8
- sglang/version.py +1 -1
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +133 -109
- sglang/srt/disaggregation/conn.py +0 -81
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py

@@ -77,6 +77,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         sparsity_ignore_list: List[str],
         kv_cache_scheme: Optional[Dict[str, Any]] = None,
         config: Optional[Dict[str, Any]] = None,
+        packed_modules_mapping: Dict[str, List[str]] = {},
     ):
         super().__init__()
         self.ignore = ignore
@@ -87,6 +88,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         self.sparsity_scheme_map = sparsity_scheme_map
         self.sparsity_ignore_list = sparsity_ignore_list
         self.config = config
+        self.packed_modules_mapping = packed_modules_mapping

     def get_linear_method(self) -> "CompressedTensorsLinearMethod":
         return CompressedTensorsLinearMethod(self)
@@ -136,6 +138,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
             config=config
         )
+        packed_modules_mapping = config.get("packed_modules_mapping", {})

         return cls(
             target_scheme_map=target_scheme_map,
@@ -144,6 +147,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             sparsity_scheme_map=sparsity_scheme_map,
             sparsity_ignore_list=sparsity_ignore_list,
             config=config,
+            packed_modules_mapping=packed_modules_mapping,
         )

     @classmethod
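The four hunks above plumb an optional `packed_modules_mapping` from the checkpoint's quantization config into `CompressedTensorsConfig`. A rough sketch of the shape of that field, using hypothetical module names (illustrative only, not values taken from any particular checkpoint) and the same `dict.get` lookup that `from_config` now performs:

# Hypothetical quantization_config fragment; the projection names below are
# examples only, not values mandated by sglang or compressed-tensors.
quantization_config = {
    "packed_modules_mapping": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    },
    # ... remaining compressed-tensors fields elided ...
}

# Mirrors the lookup added in CompressedTensorsConfig.from_config: a missing key
# falls back to an empty mapping, so older checkpoints keep working unchanged.
packed_modules_mapping = quantization_config.get("packed_modules_mapping", {})
print(packed_modules_mapping.get("qkv_proj", []))  # ['q_proj', 'k_proj', 'v_proj']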
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py

@@ -103,16 +103,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             "input_activations"
         )

-        if not (
-            self.weight_quant.strategy == QuantizationStrategy.TENSOR
-            and self.input_quant.strategy == QuantizationStrategy.TENSOR
-        ):
-            raise ValueError(
-                "For FP8 Fused MoE layers, only per-tensor scales "
-                "for weights and activations are supported. Found "
-                f"{self.weight_quant}, {self.input_quant}"
-            )
-
         self.static_input_scales = not self.input_quant.dynamic

     def create_weights(
@@ -154,27 +144,50 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         set_weight_attrs(w2_weight, extra_weight_attrs)

         # WEIGHT_SCALES
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        # per-tensor quantization
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+        elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    1,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+                requires_grad=False,
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+        else:
+            raise ValueError(
+                f"Unsupported weight quantization strategy: {self.weight_quant.strategy}"
+            )

-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
+        extra_weight_attrs.update({"quant_method": weight_quant_method})
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

         # INPUT_SCALES
         if self.static_input_scales:
+            assert (
+                self.input_quant.strategy == QuantizationStrategy.TENSOR
+            ), "Only per-tensor quantization is supported for static input scales"
             w13_input_scale = torch.nn.Parameter(
                 torch.ones(num_experts, dtype=torch.float32), requires_grad=False
             )
@@ -241,31 +254,37 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
            layer.w2_input_scale = torch.nn.Parameter(
                w2_input_scale, requires_grad=False
            )
-
-        # Fp8 moe kernel needs single weight scale for w13 per expert.
-        # We take the max then dequant and requant each expert.
-        assert layer.w13_weight_scale is not None
-        shard_size = layer.intermediate_size_per_partition
-        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-        for expert_id in range(layer.local_num_experts):
-            start = 0
-            for shard_id in range(2):
-                dq_weight = per_tensor_dequantize(
-                    layer.w13_weight[expert_id][start : start + shard_size, :],
-                    layer.w13_weight_scale[expert_id][shard_id],
-                )
-
-                if _is_cuda:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    )
-                else:
-                    layer.w13_weight[expert_id][start : start + shard_size, :], _ = (
-                        vllm_ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.local_num_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
                     )
-                start += shard_size

-        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+                    if _is_cuda:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    else:
+                        (
+                            layer.w13_weight[expert_id][start : start + shard_size, :],
+                            _,
+                        ) = vllm_ops.scaled_fp8_quant(
+                            dq_weight, max_w13_scales[expert_id]
+                        )
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )

     def apply(
         self,
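In the per-tensor branch above, the two w13 shards arrive with separate scales while the FP8 MoE kernel wants a single scale per expert, so each shard is dequantized with its own scale and requantized with the shared maximum. A minimal sketch of that scale-merging arithmetic on plain float tensors (it stands in for `per_tensor_dequantize` and `sgl_scaled_fp8_quant`; no real FP8 storage or sglang kernels are involved):

import torch

# Toy stand-in for one expert's w1/w3 shards, each "quantized" per tensor.
# Shapes and values are illustrative only.
w1_q = torch.randint(-8, 8, (4, 16)).float()
w3_q = torch.randint(-8, 8, (4, 16)).float()
scales = torch.tensor([0.02, 0.05])            # one scale per shard

# Merge: dequantize each shard with its own scale, then requantize both
# with the shared max scale, mirroring the dequant/requant loop above.
max_scale = scales.max()
merged = []
for shard_q, s in zip((w1_q, w3_q), scales):
    dequant = shard_q * s                      # per-tensor dequantize
    requant = torch.round(dequant / max_scale) # re-quantize with the shared scale
    merged.append(requant)
w13_q = torch.cat(merged, dim=0)               # one scale now covers both shards
print(w13_q.shape, float(max_scale))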
@@ -285,6 +304,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         activation: str = "silu",
         inplace: bool = True,
         no_combine: bool = False,
+        apply_router_weight_on_input: bool = False,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
@@ -310,10 +330,13 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             inplace=inplace,
             activation=activation,
             use_fp8_w8a8=True,
+            per_channel_quant=self.weight_quant.strategy
+            == QuantizationStrategy.CHANNEL,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
+            apply_router_weight_on_input=apply_router_weight_on_input,
         )

sglang/srt/layers/quantization/fp8.py

@@ -71,7 +71,8 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 _is_hip = is_hip()

 if _is_hip:
-    from aiter
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4
     from aiter.ops.shuffle import shuffle_weight

 _is_cuda = is_cuda()
@@ -487,7 +488,7 @@ class Fp8MoEMethod:

         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = (
-                torch.
+                torch.uint32
                 if get_bool_env_var("USE_INT4_WEIGHT")
                 else torch.float8_e4m3fn
             )
@@ -822,12 +823,14 @@ class Fp8MoEMethod:
             # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
             # Weight Permutation
             layer.w13_weight = torch.nn.Parameter(
-                permute_weight(layer.w13_weight.data),
+                # permute_weight(layer.w13_weight.data),
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
             layer.w2_weight = torch.nn.Parameter(
-                permute_weight(layer.w2_weight.data),
+                # permute_weight(layer.w2_weight.data),
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
@@ -860,19 +863,21 @@ class Fp8MoEMethod:
             layer.w13_weight_scale1[expert_id] *= max_w13_scales[expert_id]
             layer.w2_weight_scale1[expert_id] *= layer.w2_weight_scale[expert_id]

-    def process_weights_hip_scale_padding(self, layer: Module
+    def process_weights_hip_scale_padding(self, layer: Module):
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
             padding_size,  # Avoid circular import
         )

         if get_bool_env_var("CK_MOE"):
             layer.w13_weight = torch.nn.Parameter(
-                permute_weight(layer.w13_weight.data),
+                # permute_weight(layer.w13_weight.data),
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
             layer.w2_weight = torch.nn.Parameter(
-                permute_weight(layer.w2_weight.data),
+                # permute_weight(layer.w2_weight.data),
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
                 requires_grad=False,
             )
             torch.cuda.empty_cache()
@@ -905,6 +910,7 @@ class Fp8MoEMethod:
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
         inplace: bool = True,
         no_combine: bool = False,
     ) -> torch.Tensor:
@@ -927,7 +933,7 @@ class Fp8MoEMethod:
         if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
             # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
             assert not no_combine, f"{no_combine=} is not supported."
-            return
+            return ck_moe_2stages_win4(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
@@ -935,15 +941,17 @@ class Fp8MoEMethod:
                 topk_ids,
                 layer.w13_weight_scale1,
                 layer.w2_weight_scale1,
-                activation=
+                activation=(
+                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
+                ),
             )
         if _is_hip and get_bool_env_var("CK_MOE"):
-            # TODO(CK_MOE): FP8 or FP8 block_quant only supports 'silu' for the time-being.
-            assert (
-                activation == "silu"
-            ), f"CK_MOE: FP8 and/or FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
+                # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
+                assert (
+                    activation == "silu"
+                ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
                 return asm_moe(
                     x,
                     layer.w13_weight,
@@ -956,7 +964,7 @@ class Fp8MoEMethod:
                     expert_mask=None,
                 )
             else:
-                return
+                return ck_moe_2stages(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
@@ -964,6 +972,11 @@ class Fp8MoEMethod:
                     topk_ids,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
                 )
         else:
             # Expert fusion with FP8 quantization
@@ -975,6 +988,7 @@ class Fp8MoEMethod:
             topk_ids=topk_ids,
             inplace=inplace and not no_combine,
             activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
             use_fp8_w8a8=True,
             w1_scale=(
                 layer.w13_weight_scale_inv
sglang/srt/layers/quantization/fp8_kernel.py

@@ -16,6 +16,7 @@ import functools
 import json
 import logging
 import os
+from contextlib import contextmanager
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@@ -40,11 +41,13 @@ fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn

 _is_cuda = is_cuda()
 if _is_cuda:
-    import deep_gemm
+    import deep_gemm
     from sgl_kernel import sgl_per_token_group_quant_fp8, sgl_per_token_quant_fp8

     sm_version = get_device_sm()
-    if sm_version
+    if sm_version == 90 and get_bool_env_var(
+        "SGL_ENABLE_JIT_DEEPGEMM", default="false"
+    ):
         _enable_jit_deepgemm = True

@@ -59,7 +62,10 @@ if supports_custom_op():
         Bs: torch.Tensor,
         C: torch.Tensor,
     ) -> None:
-        deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+        M, K = A.shape
+        N, _ = B.shape
+        with _log_jit_build(M, N, K):
+            deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)

     def deep_gemm_fp8_fp8_bf16_nt_fake(
         A: torch.Tensor,
@@ -708,6 +714,25 @@ def get_w8a8_block_fp8_configs(
     return None


+@contextmanager
+def _log_jit_build(M: int, N: int, K: int):
+    from deep_gemm.jit.runtime import RuntimeCache
+
+    origin_func = RuntimeCache.__getitem__
+
+    def __patched_func(self, *args, **kwargs):
+        ret = origin_func(self, *args, **kwargs)
+        if ret is None:
+            logger.warning(
+                f"DeepGEMM JIT code generation <gemm_fp8_fp8_bf16_nt>: M={M}, N={N}, K={K}. Please wait."
+            )
+        return ret
+
+    RuntimeCache.__getitem__ = __patched_func
+    yield
+    RuntimeCache.__getitem__ = origin_func
+
+
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
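`_log_jit_build` above temporarily swaps out `RuntimeCache.__getitem__` so that a cache miss (a `None` return) is logged before DeepGEMM kicks off a JIT build, then restores the original accessor. A self-contained toy of the same patch-and-restore pattern, using a stand-in cache class instead of deep_gemm's real `RuntimeCache` (the toy also restores inside `finally`, a small hardening this sketch adds on its own):

from contextlib import contextmanager

class ToyCache:                        # stand-in for deep_gemm's RuntimeCache
    def __init__(self):
        self._store = {}
    def __getitem__(self, key):
        return self._store.get(key)    # None signals "kernel not built yet"

@contextmanager
def log_misses(tag):
    original = ToyCache.__getitem__
    def patched(self, key):
        ret = original(self, key)
        if ret is None:
            print(f"[{tag}] cache miss for {key!r}; building...")
        return ret
    ToyCache.__getitem__ = patched     # patch for the duration of the block
    try:
        yield
    finally:
        ToyCache.__getitem__ = original  # always restore the original accessor

cache = ToyCache()
with log_misses("jit"):
    cache["gemm_128x128"]              # prints a miss warning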
@@ -782,7 +807,8 @@ def w8a8_block_fp8_matmul(
        if supports_custom_op():
            torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
        else:
-            deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+            with _log_jit_build(M, N, K):
+                deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
    else:
        kernel = (
            _w8a8_block_fp8_matmul_unrolledx4
@@ -815,3 +841,103 @@ def w8a8_block_fp8_matmul(
     )

     return C
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage1(
+    x_ptr,
+    x_s_ptr,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    eps,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    _absmax = tl.maximum(tl.max(tl.abs(x)), eps)
+
+    tl.atomic_max(x_s_ptr, _absmax / fp8_max)
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage2(
+    x_ptr,
+    x_s_ptr,
+    x_q_ptr,
+    num_seq,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    fp8_min,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_s = tl.load(x_s_ptr)
+    x_s_inv = 1.0 / x_s
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x_q_ptr += head_id * num_seq * head_size + seq_id * head_size
+
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    x_q = tl.clamp(x * x_s_inv, fp8_min, fp8_max).to(x_q_ptr.dtype.element_ty)
+    tl.store(x_q_ptr + offset, x_q, mask=mask)
+
+
+def per_tensor_quant_mla_fp8(
+    x: torch.Tensor, eps: float = 1e-12, dtype: torch.dtype = torch.float8_e4m3fn
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with tensor-wise quantization
+    and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+
+    finfo = torch.finfo(dtype)
+    fp8_max = finfo.max
+    if _is_hip:
+        dtype = torch.float8_e4m3fnuz
+        fp8_max = 224.0
+
+    x_q = x.new_empty(x.size(), dtype=dtype)
+    x_s = torch.zeros((1,), dtype=torch.float32, device=x.device)
+
+    num_head, num_seq, head_size = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(head_size)
+    grid = (num_seq, num_head)
+
+    _per_tensor_quant_mla_fp8_stage1[grid](
+        x,
+        x_s,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        eps,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+    _per_tensor_quant_mla_fp8_stage2[grid](
+        x,
+        x_s,
+        x_q,
+        num_seq,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        -fp8_max,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s
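Stage 1 above reduces each (head, sequence) slice to an absolute maximum and folds it into one scalar scale via `tl.atomic_max`; stage 2 then rescales, saturates, and casts every element with that single scale. A rough eager-mode reference for the same semantics, assuming a PyTorch build that exposes `torch.float8_e4m3fn`; it only illustrates what the kernels compute, not their performance, rounding, or the HIP dtype handling:

import torch

def per_tensor_quant_mla_fp8_ref(x: torch.Tensor, eps: float = 1e-12):
    """Eager reference: one scale for the whole (num_head, num_seq, head_size) tensor."""
    assert x.dim() == 3
    dtype = torch.float8_e4m3fn
    fp8_max = torch.finfo(dtype).max

    # Stage-1 equivalent: global absmax -> single scalar scale.
    scale = (x.abs().amax().clamp_min(eps) / fp8_max).to(torch.float32)

    # Stage-2 equivalent: divide by the scale, saturate, and cast to FP8 storage.
    x_q = (x.float() / scale).clamp(-fp8_max, fp8_max).to(dtype)
    return x_q, scale.reshape(1)

x = torch.randn(16, 4, 64)             # (num_head, num_seq, head_size), illustrative
x_q, x_s = per_tensor_quant_mla_fp8_ref(x)
print(x_q.dtype, x_s.item())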
sglang/srt/layers/quantization/fp8_utils.py

@@ -168,12 +168,13 @@ def input_to_float8(
     """This function quantizes input values to float8 values with tensor-wise quantization."""
     finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
-    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
     fp8_max = finfo.max
     if _is_hip:
+        dtype = torch.float8_e4m3fnuz
         fp8_max = 224.0
     scale = fp8_max / amax
-    x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
+    x_scl_sat = (x.float() * scale).clamp(min=-fp8_max, max=fp8_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()

@@ -212,7 +213,24 @@ def block_quant_to_tensor_quant(
         for j in range(n_tiles):
             x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]

-    x_q_tensor, scale =
+    x_q_tensor, scale = (
+        sgl_scaled_fp8_quant(x_dq_block)
+        if _is_cuda
+        else input_to_float8(x_dq_block, dtype=x_q_block.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def channel_quant_to_tensor_quant(
+    x_q_channel: torch.Tensor,
+    x_s: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_dq_channel = x_q_channel.to(torch.float32) * x_s
+    x_q_tensor, scale = (
+        sgl_scaled_fp8_quant(x_dq_channel)
+        if _is_cuda
+        else input_to_float8(x_dq_channel, dtype=x_q_channel.dtype)
+    )
     return x_q_tensor, scale

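The new `channel_quant_to_tensor_quant` above folds per-output-channel FP8 weight scales into a single per-tensor scale by dequantizing and requantizing, using `sgl_scaled_fp8_quant` on CUDA and `input_to_float8` otherwise. A toy version of the same fold, written directly against `torch.float8_e4m3fn` so the shapes are visible (illustrative only; this is not the code path sglang runs):

import torch

def channel_to_tensor_requant_ref(w_q: torch.Tensor, w_s: torch.Tensor):
    """Toy fold of per-channel scales (shape (N, 1)) into one per-tensor scale."""
    w_dq = w_q.to(torch.float32) * w_s             # dequantize per output channel
    amax = w_dq.abs().amax().clamp_min(1e-12)      # tensor-wide absmax
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = amax / fp8_max
    w_q_tensor = (w_dq / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return w_q_tensor, scale

# Pretend channel-quantized weight plus one scale per output channel.
w_q = torch.randint(-16, 16, (8, 32)).float().to(torch.float8_e4m3fn)
w_s = torch.rand(8, 1) * 0.01
w_q_t, s = channel_to_tensor_requant_ref(w_q, w_s)
print(w_q_t.shape, s.item())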
@@ -242,9 +260,19 @@ def apply_fp8_linear(
        if _is_cuda:
            qinput, x_scale = sglang_per_token_quant_fp8(input_2d)
        else:
-            qinput, x_scale = per_token_group_quant_fp8(
-                input_2d, group_size=input_2d.shape[1]
-            )
+            # TODO(kkhuang): temporarily enforce per-tensor activation scaling if weight is per-tensor scaling
+            # final solution should be: 1. add support to per-tensor activation scaling.
+            # 2. solve the torch.compile error from weight_scale.numel() == 1 and x_scale.numel() > 1 (below line#308)
+            if _is_hip and weight_scale.numel() == 1:
+                qinput, x_scale = ops.scaled_fp8_quant(
+                    input_2d,
+                    input_scale,
+                    use_per_token_if_dynamic=use_per_token_if_dynamic,
+                )
+            else:
+                qinput, x_scale = per_token_group_quant_fp8(
+                    input_2d, group_size=input_2d.shape[1]
+                )

    if cutlass_fp8_supported:
        try: