sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/utils.py
CHANGED
@@ -1,55 +1,85 @@
+from __future__ import annotations
+
 import importlib.util
 from enum import Enum
 from functools import lru_cache
+from typing import TYPE_CHECKING, Optional
 
 from packaging import version as pkg_version
 
-from sglang.srt.
-
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.utils import logger
 
-
-
-    result = global_server_args_dict["enable_flashinfer_trtllm_moe"] and (
-        not importlib.util.find_spec("flashinfer")
-        or pkg_version.parse(__import__("flashinfer").__version__)
-        >= pkg_version.parse("0.2.9rc1")
-    )
-    return result
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
 
 
 class MoeA2ABackend(Enum):
 
-
+    NONE = "none"
     DEEPEP = "deepep"
 
     @classmethod
     def _missing_(cls, value):
         if value is None:
-            return cls.
+            return cls.NONE
         for member in cls:
-            if value
+            if value == member.value:
                 return member
         raise ValueError(f"No {cls.__name__} member for value {value}")
 
+    def is_none(self):
+        return self == MoeA2ABackend.NONE
+
     def is_deepep(self):
         return self == MoeA2ABackend.DEEPEP
 
-
-
+
+class MoeRunnerBackend(Enum):
+
+    AUTO = "auto"
+    TRITON = "triton"
+    TRITON_KERNEL = "triton_kernel"
+    FLASHINFER = "flashinfer_trtllm"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_MXFP4 = "flashinfer_mxfp4"
+
+    def is_auto(self):
+        return self == MoeRunnerBackend.AUTO
+
+    def is_triton(self):
+        return self == MoeRunnerBackend.TRITON
+
+    def is_triton_kernel(self):
+        return self == MoeRunnerBackend.TRITON_KERNEL
+
+    def is_flashinfer_trtllm(self):
+        return self == MoeRunnerBackend.FLASHINFER
+
+    def is_flashinfer_cutlass(self):
+        return self == MoeRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_mxfp4(self):
+        return self == MoeRunnerBackend.FLASHINFER_MXFP4
 
 
 class DeepEPMode(Enum):
+
     NORMAL = "normal"
     LOW_LATENCY = "low_latency"
     AUTO = "auto"
 
-    def enable_normal(self):
+    def enable_normal(self) -> bool:
         return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
 
-    def enable_low_latency(self):
+    def enable_low_latency(self) -> bool:
         return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
 
-    def resolve(self, is_extend_in_batch: bool):
+    def resolve(self, is_extend_in_batch: bool) -> DeepEPMode:
         if self != DeepEPMode.AUTO:
             return self
 
@@ -57,3 +87,114 @@ class DeepEPMode(Enum):
             return DeepEPMode.NORMAL
         else:
             return DeepEPMode.LOW_LATENCY
+
+    def is_normal(self) -> bool:
+        return self == DeepEPMode.NORMAL
+
+    def is_low_latency(self) -> bool:
+        return self == DeepEPMode.LOW_LATENCY
+
+    def is_auto(self) -> bool:
+        return self == DeepEPMode.AUTO
+
+
+MOE_A2A_BACKEND: Optional[MoeA2ABackend] = None
+MOE_RUNNER_BACKEND: Optional[MoeRunnerBackend] = None
+DEEPEP_MODE: Optional[DeepEPMode] = None
+IS_TBO_ENABLED: Optional[bool] = None
+TBO_TOKEN_DISTRIBUTION_THRESHOLD: Optional[float] = None
+DEEPEP_CONFIG: Optional[str] = None
+DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER: Optional[bool] = None
+
+
+def initialize_moe_config(server_args: ServerArgs):
+    global MOE_A2A_BACKEND
+    global MOE_RUNNER_BACKEND
+    global DEEPEP_MODE
+    global DEEPEP_CONFIG
+    global IS_TBO_ENABLED
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    global DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+
+    MOE_A2A_BACKEND = MoeA2ABackend(server_args.moe_a2a_backend)
+    MOE_RUNNER_BACKEND = MoeRunnerBackend(server_args.moe_runner_backend)
+    DEEPEP_MODE = DeepEPMode(server_args.deepep_mode)
+    DEEPEP_CONFIG = server_args.deepep_config or ""
+    IS_TBO_ENABLED = server_args.enable_two_batch_overlap
+    TBO_TOKEN_DISTRIBUTION_THRESHOLD = server_args.tbo_token_distribution_threshold
+    DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = (
+        server_args.disable_flashinfer_cutlass_moe_fp4_allgather
+    )
+
+
+def get_moe_a2a_backend() -> MoeA2ABackend:
+    global MOE_A2A_BACKEND
+    if MOE_A2A_BACKEND is None:
+        logger.warning("MOE_A2A_BACKEND is not initialized, using default backend")
+        MOE_A2A_BACKEND = MoeA2ABackend(None)
+    return MOE_A2A_BACKEND
+
+
+def get_moe_runner_backend() -> MoeRunnerBackend:
+    global MOE_RUNNER_BACKEND
+    if MOE_RUNNER_BACKEND is None:
+        logger.warning("MOE_RUNNER_BACKEND is not initialized, using triton backend")
+        MOE_RUNNER_BACKEND = MoeRunnerBackend("triton")
+    return MOE_RUNNER_BACKEND
+
+
+def get_deepep_mode() -> DeepEPMode:
+    global DEEPEP_MODE
+    if DEEPEP_MODE is None:
+        logger.warning("DEEPEP_MODE is not initialized, using auto mode")
+        DEEPEP_MODE = DeepEPMode("auto")
+    return DEEPEP_MODE
+
+
+def get_deepep_config() -> str:
+    global DEEPEP_CONFIG
+    if DEEPEP_CONFIG is None:
+        logger.warning("DEEPEP_CONFIG is not initialized, using default config")
+        DEEPEP_CONFIG = ""
+    return DEEPEP_CONFIG
+
+
+def is_tbo_enabled() -> bool:
+    global IS_TBO_ENABLED
+    if IS_TBO_ENABLED is None:
+        logger.warning("IS_TBO_ENABLED is not initialized, using False")
+        IS_TBO_ENABLED = False
+    return IS_TBO_ENABLED
+
+
+def get_tbo_token_distribution_threshold() -> float:
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    if TBO_TOKEN_DISTRIBUTION_THRESHOLD is None:
+        logger.warning(
+            "TBO_TOKEN_DISTRIBUTION_THRESHOLD is not initialized, using 0.48"
+        )
+        TBO_TOKEN_DISTRIBUTION_THRESHOLD = 0.48
+    return TBO_TOKEN_DISTRIBUTION_THRESHOLD
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    result = get_moe_runner_backend().is_flashinfer_trtllm() and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+    return result
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_cutlass_moe_fp4_allgather():
+    """
+    Perform FP4 quantize before all-gather for flashinfer cutlass moe to reduce communication cost for high-throughput serving.
+    """
+    return (
+        not DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+        and get_moe_runner_backend().is_flashinfer_cutlass()
+        and is_dp_attention_enabled()
+        and get_moe_expert_parallel_world_size() == get_attention_dp_size()
+    )
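The refactor above replaces ad-hoc reads of global_server_args_dict with module-level state plus typed accessors. A minimal usage sketch; the SimpleNamespace below is only a stand-in for ServerArgs, carrying just the fields initialize_moe_config reads:

    from types import SimpleNamespace

    from sglang.srt.layers.moe.utils import (
        DeepEPMode,
        get_deepep_mode,
        get_moe_a2a_backend,
        get_moe_runner_backend,
        initialize_moe_config,
    )

    # Stand-in for ServerArgs (sglang/srt/server_args.py) with only the
    # fields that initialize_moe_config reads.
    args = SimpleNamespace(
        moe_a2a_backend="deepep",
        moe_runner_backend="auto",
        deepep_mode="auto",
        deepep_config=None,
        enable_two_batch_overlap=False,
        tbo_token_distribution_threshold=0.48,
        disable_flashinfer_cutlass_moe_fp4_allgather=False,
    )

    initialize_moe_config(args)  # normally called once at server startup
    assert get_moe_a2a_backend().is_deepep()
    assert get_moe_runner_backend().is_auto()
    # AUTO resolves per batch: extend batches take the NORMAL path.
    assert get_deepep_mode().resolve(is_extend_in_batch=True) == DeepEPMode.NORMAL

Note the getters log a warning and fall back to a default rather than raising if initialize_moe_config was never called.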
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -16,7 +16,6 @@ try:
     )
     from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
     from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
-    from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
     from vllm.model_executor.layers.quantization.gguf import GGUFConfig
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
@@ -37,9 +36,9 @@ except ImportError as e:
 
     AQLMConfig = BitsAndBytesConfig = CompressedTensorsConfig = DeepSpeedFPConfig = (
         ExpertsInt8Config
-    ) =
-
-    )
+    ) = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = Int8TpuConfig = (
+        DummyConfig
+    )
 
 
 from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
@@ -48,13 +47,8 @@ from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
-from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
-
-is_mxfp_supported = mxfp_supported()
-if is_mxfp_supported:
-    from sglang.srt.layers.quantization.fp4 import MxFp4Config
-
 from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
 from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
@@ -67,6 +61,9 @@ from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
+from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
+
+_is_mxfp_supported = mxfp_supported()
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
@@ -88,6 +85,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "qoq": QoQConfig,
     "w4afp8": W4AFp8Config,
     "petit_nvfp4": PetitNvFp4Config,
+    "fbgemm_fp8": FBGEMMFp8Config,
 }
 
 
@@ -98,11 +96,13 @@ if is_cuda():
             "mxfp4": Mxfp4Config,
         }
     )
-elif
+elif _is_mxfp_supported and is_hip():
+    from sglang.srt.layers.quantization.quark.quark import QuarkConfig
+
     BASE_QUANTIZATION_METHODS.update(
         {
-            "quark":
-            "mxfp4":
+            "quark": QuarkConfig,
+            "mxfp4": Mxfp4Config,
         }
     )
 # VLLM-dependent quantization methods
@@ -110,7 +110,6 @@ VLLM_QUANTIZATION_METHODS = {
     "aqlm": AQLMConfig,
     "deepspeedfp": DeepSpeedFPConfig,
     "tpu_int8": Int8TpuConfig,
-    "fbgemm_fp8": FBGEMMFp8Config,
     "marlin": MarlinConfig,
     "gguf": GGUFConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
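Net effect of these registry edits: "fbgemm_fp8" moves out of the vllm-backed VLLM_QUANTIZATION_METHODS table into BASE_QUANTIZATION_METHODS, backed by sglang's own FBGEMMFp8Config (sglang/srt/layers/quantization/fpgemm_fp8.py), so resolving it no longer requires vllm. A hedged lookup sketch; resolve_quant_config is illustrative, not sglang's API:

    from sglang.srt.layers.quantization import BASE_QUANTIZATION_METHODS

    def resolve_quant_config(method: str):
        # "fbgemm_fp8" now maps to sglang's native FBGEMMFp8Config rather
        # than the vllm import removed above.
        return BASE_QUANTIZATION_METHODS[method]

    cfg_cls = resolve_quant_config("fbgemm_fp8")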
sglang/srt/layers/quantization/awq.py
CHANGED
@@ -33,7 +33,8 @@ from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.utils import get_scalar_types, replace_parameter
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.topk import StandardTopKOutput
 
 from sglang.srt.utils import is_cuda, is_hip
 
@@ -739,13 +740,12 @@ class AWQMoEMethod(FusedMoEMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        topk_output:
-
-        activation: str = "silu",
-        **kwargs,
+        topk_output: StandardTopKOutput,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
-
-
+        assert (
+            moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
 
         # The input must currently be float16
         orig_dtype = x.dtype
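Other hunks in this release unpack topk_output as a triple (topk_weights, topk_ids, router_logits), so StandardTopKOutput can be read as such a 3-tuple. An illustrative sketch only, not the actual definition in sglang/srt/layers/moe/topk.py, which may carry more structure:

    from typing import NamedTuple

    import torch

    class StandardTopKOutput(NamedTuple):
        # Per-token routing result: weights and expert ids of the selected
        # top-k experts, plus the raw router logits.
        topk_weights: torch.Tensor
        topk_ids: torch.Tensor
        router_logits: torch.Tensor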
sglang/srt/layers/quantization/base_config.py
CHANGED
@@ -9,6 +9,7 @@ import torch
 from torch import nn
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
     from sglang.srt.layers.moe.topk import TopKOutput
 
 
@@ -100,12 +101,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         layer: torch.nn.Module,
         x: torch.Tensor,
         topk_output: TopKOutput,
-
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
         raise NotImplementedError
 
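The loose keyword arguments dropped from apply here (activation, apply_router_weight_on_input, inplace, no_combine, routed_scaling_factor) reappear throughout this diff as fields read off moe_runner_config. A minimal sketch of such a config, assuming it merely bundles those former defaults; the real definition lives in sglang/srt/layers/moe/moe_runner/base.py and may differ:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class MoeRunnerConfig:
        # Hypothetical bundling of the per-call kwargs the old apply()
        # signature took individually, with the same defaults.
        activation: str = "silu"
        apply_router_weight_on_input: bool = False
        inplace: bool = True
        no_combine: bool = False
        routed_scaling_factor: Optional[float] = None

Threading one object through every FusedMoEMethodBase subclass keeps the abstract signature stable as new runner options are added.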
sglang/srt/layers/quantization/blockwise_int8.py
CHANGED
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
 from torch.nn import Module
@@ -22,6 +22,7 @@ from sglang.srt.layers.quantization.utils import is_layer_skipped
 from sglang.srt.utils import set_weight_attrs
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
     from sglang.srt.layers.moe.topk import TopKOutput
 
 ACTIVATION_SCHEMES = ["static", "dynamic"]
@@ -348,12 +349,7 @@ class BlockInt8MoEMethod(FusedMoEMethodBase):
         layer: torch.nn.Module,
         x: torch.Tensor,
         topk_output: TopKOutput,
-
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
 
@@ -363,15 +359,11 @@ class BlockInt8MoEMethod(FusedMoEMethodBase):
             layer.w13_weight,
             layer.w2_weight,
             topk_output=topk_output,
-
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
+            moe_runner_config=moe_runner_config,
             use_int8_w8a8=True,
             w1_scale=(layer.w13_weight_scale_inv),
             w2_scale=(layer.w2_weight_scale_inv),
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
-            no_combine=no_combine,
-            routed_scaling_factor=routed_scaling_factor,
         )
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
CHANGED
@@ -19,15 +19,30 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    set_weight_attrs,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
     from sglang.srt.layers.moe.topk import TopKOutput
     from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
         CompressedTensorsConfig,
     )
 
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    from aiter.ops.shuffle import shuffle_weight
+
+    from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
 
 try:
     import vllm
@@ -264,37 +279,66 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             max_w13_scales, requires_grad=False
         )
 
+        if _use_aiter:
+            with torch.no_grad():
+                # Pre-shuffle weights
+                layer.w13_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w13_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+                layer.w2_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w2_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         topk_output: TopKOutput,
-
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton import fused_experts
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if (
+            _use_aiter
+            and self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+            and moe_runner_config.apply_router_weight_on_input
+        ):
+            topk_weights, topk_ids, _ = topk_output
+            return rocm_fused_experts_tkw1(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=moe_runner_config.activation,
+                apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+                use_fp8_w8a8=True,
+                per_channel_quant=self.weight_quant.strategy
+                == QuantizationStrategy.CHANNEL,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
+        else:
+            return fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_output=topk_output,
+                moe_runner_config=moe_runner_config,
+                use_fp8_w8a8=True,
+                per_channel_quant=self.weight_quant.strategy
+                == QuantizationStrategy.CHANNEL,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
 
 
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
@@ -601,12 +645,12 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         layer: torch.nn.Module,
         x: torch.Tensor,
         topk_output: TopKOutput,
-
-        activation: str = "silu",
-        **kwargs,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
 
-        assert
+        assert (
+            moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
 
         topk_weights, topk_ids, router_logits = topk_output
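The aiter branch above only fires under a narrow conjunction: ROCm with SGLANG_USE_AITER set, per-channel FP8 weights, and router weights applied on the input. A condensed, standalone restatement of that gate (not sglang code; get_bool_env_var is approximated with os.getenv):

    import os

    def use_rocm_tkw1_path(
        on_hip: bool, per_channel_fp8: bool, router_weight_on_input: bool
    ) -> bool:
        # Mirrors: _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip(),
        # plus the two extra conditions checked inside apply().
        use_aiter = (
            os.getenv("SGLANG_USE_AITER", "false").lower() in ("1", "true")
            and on_hip
        )
        return use_aiter and per_channel_fp8 and router_weight_on_input

Every other combination falls through to the generic Triton fused_experts path, now parameterized by moe_runner_config like the other MoE methods in this release.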