sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +16 -6
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +27 -12
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +76 -102
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -14
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +26 -17
- sglang/srt/layers/quantization/__init__.py +22 -23
- sglang/srt/layers/quantization/fp8.py +112 -55
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +2 -3
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +17 -4
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +46 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -8
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +54 -15
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +319 -181
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +303 -158
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +110 -77
- sglang/srt/metrics/collector.py +25 -11
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +80 -21
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +41 -4
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +52 -4
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +153 -9
- sglang/srt/sampling/sampling_params.py +4 -2
- sglang/srt/server.py +4 -1037
- sglang/srt/server_args.py +84 -32
- sglang/srt/speculative/eagle_worker.py +1 -0
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +130 -63
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +26 -13
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +126 -117
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -5,20 +5,21 @@ from enum import Enum
 from typing import Callable, List, Optional, Tuple
 
 import torch
-from vllm.distributed import (
+from vllm.model_executor.custom_op import CustomOp
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.custom_op import CustomOp
-
 from sglang.srt.layers.custom_op_util import register_custom_op
+from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import get_bool_env_var, is_hip, permute_weight, set_weight_attrs
 
 if torch.cuda.is_available():
     from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
@@ -27,6 +28,8 @@ else:
 
 import logging
 
+is_hip_ = is_hip()
+
 logger = logging.getLogger(__name__)
 
 
@@ -97,6 +100,20 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if is_hip_ and get_bool_env_var("CK_MOE"):
+            layer.w13_weight = torch.nn.Parameter(
+                permute_weight(layer.w13_weight.data),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                permute_weight(layer.w2_weight.data),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+        return
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -148,17 +165,52 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             correction_bias=correction_bias,
         )
 
-
-
-
-
-
-
-
-
+        if is_hip_ and get_bool_env_var("CK_MOE"):
+            import ater
+            from ater.fused_moe import fused_experts_ck
+
+            return fused_experts_ck(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+            )
+        else:
+            return fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+            )
 
-    def forward_cpu(
-
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return moe_forward_native(
+            layer,
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function,
+            correction_bias,
+        )
 
     def forward_tpu(self, *args, **kwargs) -> torch.Tensor:
         raise NotImplementedError("The TPU backend currently does not support MoE.")
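The new `forward_cpu` path above just forwards its arguments to `moe_forward_native` from the new `sglang/srt/layers/moe/fused_moe_native.py` module, which this diff lists but does not expand. As a rough point of reference only, a torch-native MoE forward of this shape usually looks like the sketch below; the names, shapes, and SwiGLU activation are assumptions for illustration, not sglang's actual implementation.

```python
import torch
import torch.nn.functional as F


def naive_moe_forward(x, w13, w2, router_logits, top_k, renormalize=True):
    """Reference MoE forward: x [T, H], w13 [E, 2I, H], w2 [E, H, I]."""
    # Route each token to its top-k experts.
    topk_weights, topk_ids = torch.topk(router_logits.softmax(dim=-1), top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    out = torch.zeros_like(x)
    for e in range(w13.shape[0]):
        token_idx, slot = (topk_ids == e).nonzero(as_tuple=True)
        if token_idx.numel() == 0:
            continue
        h = x[token_idx] @ w13[e].t()          # fused gate/up projection -> [t, 2I]
        gate, up = h.chunk(2, dim=-1)
        h = F.silu(gate) * up                  # SwiGLU
        out[token_idx] += topk_weights[token_idx, slot, None] * (h @ w2[e].t())
    return out


x = torch.randn(5, 16)                          # 5 tokens, hidden size 16
w13 = torch.randn(4, 2 * 32, 16)                # 4 experts, intermediate size 32
w2 = torch.randn(4, 16, 32)
print(naive_moe_forward(x, w13, w2, torch.randn(5, 4), top_k=2).shape)  # [5, 16]
```

The CK_MOE and Triton branches in `apply` compute the same weighted mixture of expert outputs, just with fused kernels instead of this per-expert Python loop.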
sglang/srt/layers/moe/topk.py
CHANGED
@@ -24,7 +24,9 @@ def fused_topk_native(
     topk: int,
     renormalize: bool,
 ):
-    assert
+    assert (
+        hidden_states.shape[0] == gating_output.shape[0]
+    ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"
     M, _ = hidden_states.shape
     topk_weights = torch.empty(
         M, topk, dtype=torch.float32, device=hidden_states.device
@@ -180,7 +182,7 @@ def select_experts(
             num_expert_group=num_expert_group,
             topk_group=topk_group,
         )
-    elif torch_native:
+    elif torch_native and custom_routing_function is None:
         topk_weights, topk_ids = fused_topk_native(
             hidden_states=hidden_states,
             gating_output=router_logits,
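For context on the two changes above: `fused_topk_native` performs per-token softmax routing over the router logits, and the extra `and custom_routing_function is None` condition keeps a caller-supplied routing function from being silently bypassed on the torch-native path. A minimal sketch of the same routing math (illustrative only, not the sglang function itself):

```python
import torch


def topk_softmax_reference(gating_output: torch.Tensor, topk: int, renormalize: bool):
    # gating_output: [num_tokens, num_experts]
    assert gating_output.dim() == 2, "expects [num_tokens, num_experts]"
    scores = torch.softmax(gating_output.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Make the selected expert weights sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


weights, ids = topk_softmax_reference(torch.randn(6, 8), topk=2, renormalize=True)
print(weights.sum(dim=-1))  # each row sums to 1 after renormalization
```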
sglang/srt/layers/parameter.py
CHANGED
@@ -1,7 +1,4 @@
-"""
-Adapted from vLLM (0.6.4.post1).
-https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/parameter.py
-"""
+"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py"""
 
 import logging
 from fractions import Fraction
@@ -9,7 +6,8 @@ from typing import Callable, Optional, Union
 
 import torch
 from torch.nn import Parameter
-
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 
 __all__ = [
     "BasevLLMParameter",
@@ -88,12 +86,17 @@ class _ColumnvLLMParameter(BasevLLMParameter):
     def output_dim(self):
         return self._output_dim
 
-    def load_column_parallel_weight(
-
-
-
-
-
+    def load_column_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.output_dim]
+            loaded_weight = loaded_weight.narrow(
+                self.output_dim, tp_rank * shard_size, shard_size
+            )
         assert self.data.shape == loaded_weight.shape
         self.data.copy_(loaded_weight)
 
@@ -121,7 +124,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, tp_rank: int, **kwargs):
 
         shard_offset = kwargs.get("shard_offset")
         shard_size = kwargs.get("shard_size")
@@ -137,7 +140,6 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         )
 
         param_data = self.data
-        tp_rank = get_tensor_model_parallel_rank()
         shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
         loaded_weight = loaded_weight.narrow(
@@ -164,11 +166,14 @@ class RowvLLMParameter(BasevLLMParameter):
     def input_dim(self):
         return self._input_dim
 
-    def load_row_parallel_weight(
-
-
-
+    def load_row_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
         if not use_presharded_weights:
+            shard_size = self.data.shape[self.input_dim]
             loaded_weight = loaded_weight.narrow(
                 self.input_dim, tp_rank * shard_size, shard_size
             )
@@ -238,6 +243,8 @@ class PerTensorScaleParameter(BasevLLMParameter):
     # For row parallel layers, no sharding needed
     # load weight into parameter as is
     def load_row_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
        super().load_row_parallel_weight(*args, **kwargs)
 
     def load_merged_column_weight(self, *args, **kwargs):
@@ -247,6 +254,8 @@ class PerTensorScaleParameter(BasevLLMParameter):
         self._load_into_shard_id(*args, **kwargs)
 
     def load_column_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
         super().load_row_parallel_weight(*args, **kwargs)
 
     def _load_into_shard_id(
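The reworked loaders above now receive `tp_rank` explicitly and accept a `use_presharded_weights` flag instead of looking up the rank inside the parameter class. The slicing itself is plain `Tensor.narrow` on the sharded dimension; a toy illustration with made-up shapes:

```python
import torch

# Hypothetical sizes: a [out=8, in=4] checkpoint weight, column-parallel over 4 ranks.
full_weight = torch.arange(8 * 4, dtype=torch.float32).reshape(8, 4)
tp_size, tp_rank, output_dim = 4, 1, 0

local_param = torch.empty(8 // tp_size, 4)      # each rank owns out/tp_size rows
shard_size = local_param.shape[output_dim]      # 2
shard = full_weight.narrow(output_dim, tp_rank * shard_size, shard_size)
assert shard.shape == local_param.shape
local_param.copy_(shard)                        # rank 1 receives rows 2..3

# With use_presharded_weights=True the checkpoint already stores only this rank's
# rows, so the narrow step is skipped and the tensor is copied as-is.
```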
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -23,6 +23,7 @@ from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config
+from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
 QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "aqlm": AQLMConfig,
@@ -42,6 +43,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "bitsandbytes": BitsAndBytesConfig,
     "qqq": QQQConfig,
     "experts_int8": ExpertsInt8Config,
+    "w8a8_int8": W8A8Int8Config,
 }
 
 
@@ -54,33 +56,13 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     return QUANTIZATION_METHODS[quantization]
 
 
-def fp8_get_quant_method(self, layer, prefix):
-    """Enhanced get_quant_method for FP8 config."""
-    from vllm.model_executor.layers.linear import LinearBase
-    from vllm.model_executor.layers.quantization.utils.quant_utils import (
-        is_layer_skipped,
-    )
-
-    from sglang.srt.layers.linear import UnquantizedLinearMethod
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-    from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod, Fp8MoEMethod
-
-    if isinstance(layer, LinearBase):
-        if is_layer_skipped(prefix, self.ignored_layers):
-            return UnquantizedLinearMethod()
-        return Fp8LinearMethod(self)
-    elif isinstance(layer, FusedMoE):
-        return Fp8MoEMethod(self)
-    return None
-
-
 def gptq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
         GPTQMarlinMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):
@@ -91,12 +73,12 @@ def gptq_get_quant_method(self, layer, prefix):
 
 
 def awq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.linear import LinearBase
     from vllm.model_executor.layers.quantization.awq_marlin import (
         AWQMarlinLinearMethod,
         AWQMoEMethod,
     )
 
+    from sglang.srt.layers.linear import LinearBase
     from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 
     if isinstance(layer, LinearBase):
@@ -106,13 +88,30 @@ def awq_get_quant_method(self, layer, prefix):
     return None
 
 
+def patch_vllm_linear_base_isinstance():
+    import builtins
+
+    from vllm.model_executor.layers.linear import LinearBase
+
+    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
+
+    original_isinstance = builtins.isinstance
+
+    def patched_isinstance(obj, classinfo):
+        if classinfo is LinearBase:
+            return original_isinstance(obj, PatchedLinearBase)
+        return original_isinstance(obj, classinfo)
+
+    builtins.isinstance = patched_isinstance
+
+
 def apply_monkey_patches():
     """Apply all monkey patches in one place."""
-    setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)
 
 
+patch_vllm_linear_base_isinstance()
 # Apply patches when module is imported
 apply_monkey_patches()
 
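The new `patch_vllm_linear_base_isinstance` helper rebinds `builtins.isinstance` so that third-party checks against vLLM's `LinearBase` also accept layers built on sglang's own `LinearBase`. A self-contained demo of the same pattern, using stand-in classes rather than the real ones:

```python
import builtins


class UpstreamBase:       # stands in for vllm's LinearBase
    pass


class LocalBase:          # stands in for sglang's LinearBase
    pass


original_isinstance = builtins.isinstance


def patched_isinstance(obj, classinfo):
    # Checks against the upstream base are answered against the local base,
    # so objects built on LocalBase pass third-party isinstance checks.
    if classinfo is UpstreamBase:
        return original_isinstance(obj, LocalBase)
    return original_isinstance(obj, classinfo)


builtins.isinstance = patched_isinstance
print(isinstance(LocalBase(), UpstreamBase))    # True after the patch
builtins.isinstance = original_isinstance       # restore; the demo undoes the patch
```

The trade-off is that the patch is process-global, which is why the module applies it exactly once at import time.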
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,7 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
 
 import logging
-import os
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
@@ -9,8 +8,6 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear,
@@ -26,7 +23,12 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     requantize_with_max_scale,
 )
 
-from sglang.srt.
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -40,12 +42,15 @@ from sglang.srt.layers.quantization.fp8_utils import (
 from sglang.srt.utils import (
     get_bool_env_var,
     is_hip,
+    permute_weight,
     print_warning_once,
     set_weight_attrs,
 )
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
+is_hip_ = is_hip()
+
 logger = logging.getLogger(__name__)
 
 
@@ -161,7 +166,7 @@ class Fp8LinearMethod(LinearMethodBase):
         # kernel for fast weight-only FP8 quantization
         self.use_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
         # Disable marlin for ROCm
-        if is_hip():
+        if is_hip_:
             self.use_marlin = False
 
         self.block_quant = self.quant_config.weight_block_size is not None
@@ -273,7 +278,7 @@ class Fp8LinearMethod(LinearMethodBase):
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # activation_scheme: dynamic
                 weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.weight,
@@ -330,7 +335,7 @@ class Fp8LinearMethod(LinearMethodBase):
         weight_scale = layer.weight_scale
 
         # If ROCm, normalize the weights and scales to e4m3fnuz
-        if is_hip():
+        if is_hip_:
             weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                 weight=weight,
                 weight_scale=weight_scale,
@@ -567,7 +572,7 @@ class Fp8MoEMethod:
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # activation_scheme: dynamic
                 w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.w13_weight,
@@ -594,7 +599,7 @@ class Fp8MoEMethod:
         # If checkpoint is fp16 or bfloat16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             # If ROCm, use float8_e4m3fnuz instead (MI300x HW)
-            fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
+            fp8_dtype = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
             w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
             w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
 
@@ -616,18 +621,30 @@ class Fp8MoEMethod:
             layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
 
-
-
-
-
-
-
-
-
-
-
-
-
+            if is_hip_:
+                if get_bool_env_var("CK_MOE"):
+                    layer.w13_weight = torch.nn.Parameter(
+                        permute_weight(layer.w13_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        permute_weight(layer.w2_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                elif get_bool_env_var("MOE_PADDING"):
+                    # If ROCm, apply weight padding (min. Mem channel contention) only if set
+                    layer.w13_weight = torch.nn.Parameter(
+                        F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
             return
 
         # If checkpoint is fp8, we need to handle that the
@@ -658,7 +675,7 @@ class Fp8MoEMethod:
             )
 
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # Normalize the weights and scales
                 w13_weight, w13_weight_scale, w13_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
@@ -708,18 +725,30 @@ class Fp8MoEMethod:
                 max_w13_scales, requires_grad=False
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
+            if is_hip_:
+                if get_bool_env_var("CK_MOE"):
+                    layer.w13_weight = torch.nn.Parameter(
+                        permute_weight(layer.w13_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        permute_weight(layer.w2_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                elif get_bool_env_var("MOE_PADDING"):
+                    # If ROCm, apply weight padding (min. Mem channel contention) only if set
+                    layer.w13_weight = torch.nn.Parameter(
+                        F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
             return
 
     def apply(
@@ -752,27 +781,55 @@ class Fp8MoEMethod:
             correction_bias=correction_bias,
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if is_hip_ and get_bool_env_var("CK_MOE"):
+            import ater
+            from ater.fused_moe import fused_experts_ck
+
+            return fused_experts_ck(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                use_fp8_w8a8=True,
+                w1_scale=(
+                    layer.w13_weight_scale_inv
+                    if self.block_quant
+                    else layer.w13_weight_scale
+                ),
+                w2_scale=(
+                    layer.w2_weight_scale_inv
+                    if self.block_quant
+                    else layer.w2_weight_scale
+                ),
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
+
+        else:
+            # Expert fusion with FP8 quantization
+            return fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                use_fp8_w8a8=True,
+                w1_scale=(
+                    layer.w13_weight_scale_inv
+                    if self.block_quant
+                    else layer.w13_weight_scale
+                ),
+                w2_scale=(
+                    layer.w2_weight_scale_inv
+                    if self.block_quant
+                    else layer.w2_weight_scale
+                ),
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+                block_shape=self.quant_config.weight_block_size,
+            )
 
 
 class Fp8KVCacheMethod(BaseKVCacheMethod):
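Both ROCm branches added to `process_weights_after_loading` above are weight-layout transforms: `CK_MOE` re-permutes the expert weights for the CK kernels via `permute_weight`, while `MOE_PADDING` zero-pads the innermost dimension of `w13`/`w2` to reduce memory-channel contention. The padding is ordinary `F.pad` on the last dimension; a quick illustration with an arbitrary `padding_size` (the real value comes from the module and is not shown in this diff):

```python
import torch
import torch.nn.functional as F

padding_size = 128                                   # example value only
w = torch.randn(8, 14336, 4096)                      # [num_experts, out, in]
w_padded = F.pad(w, (0, padding_size), "constant", 0)
print(w_padded.shape)                                # torch.Size([8, 14336, 4224])
```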
sglang/srt/layers/quantization/fp8_utils.py
CHANGED
@@ -1,8 +1,8 @@
 from typing import List, Optional, Tuple
 
 import torch
-from vllm.model_executor.parameter import RowvLLMParameter, _ColumnvLLMParameter
 
+from sglang.srt.layers.parameter import RowvLLMParameter, _ColumnvLLMParameter
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
sglang/srt/layers/quantization/int8_kernel.py
ADDED
@@ -0,0 +1,54 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _per_token_quant_int8(
+    x_ptr,
+    xq_ptr,
+    scale_ptr,
+    stride_x,
+    stride_xq,
+    N,
+    BLOCK: tl.constexpr,
+):
+    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
+    row_id = tl.program_id(0)
+
+    cols = tl.arange(0, BLOCK)
+    mask = cols < N
+
+    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
+    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
+    scale_x = absmax / 127
+    x_q = x * (127 / absmax)
+    x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
+
+    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
+    tl.store(scale_ptr + row_id, scale_x)
+
+
+def per_token_quant_int8(x):
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+
+    assert x.is_contiguous()
+    _per_token_quant_int8[(M,)](
+        x,
+        x_q,
+        scales,
+        stride_x=x.stride(-2),
+        stride_xq=x_q.stride(-2),
+        N=N,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+
+    return x_q, scales
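The new `int8_kernel.py` above performs per-token (per-row) absmax quantization to int8 in Triton, storing one float32 scale per row. A pure-PyTorch reference of the same math is handy for spot-checking the kernel on small inputs (a sketch, not part of sglang):

```python
import torch


def per_token_quant_int8_ref(x: torch.Tensor):
    # Per-row absmax scaling, matching the kernel's scale = absmax / 127 convention.
    x_f32 = x.float()
    absmax = x_f32.abs().amax(dim=-1, keepdim=True).clamp_min(1e-10)
    scales = absmax / 127
    x_q = torch.round(x_f32 * (127 / absmax)).to(torch.int8)
    return x_q, scales


x = torch.randn(4, 16)
x_q, scales = per_token_quant_int8_ref(x)
# Dequantization error is bounded by half a quantization step per row.
assert torch.allclose(x_q.float() * scales, x, atol=scales.max().item())
print(x_q.dtype, scales.shape)  # torch.int8 torch.Size([4, 1])
```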
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED
@@ -5,15 +5,14 @@ from typing import Any, Dict, List, Optional
 
 import torch
 from torch.nn.parameter import Parameter
-from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
     requantize_with_max_scale,
 )
-from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
 
-from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.linear import LinearBase, LinearMethodBase
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,