sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/layers/moe/utils.py
+++ b/sglang/srt/layers/moe/utils.py
@@ -1,55 +1,85 @@
+from __future__ import annotations
+
 import importlib.util
 from enum import Enum
 from functools import lru_cache
+from typing import TYPE_CHECKING, Optional
 
 from packaging import version as pkg_version
 
-from sglang.srt.managers.schedule_batch import global_server_args_dict
-
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.utils import logger
 
-@lru_cache(maxsize=1)
-def should_use_flashinfer_trtllm_moe():
-    result = global_server_args_dict["enable_flashinfer_trtllm_moe"] and (
-        not importlib.util.find_spec("flashinfer")
-        or pkg_version.parse(__import__("flashinfer").__version__)
-        >= pkg_version.parse("0.2.9rc1")
-    )
-    return result
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
 
 
 class MoeA2ABackend(Enum):
 
-    STANDARD = ("standard", "none")
+    NONE = "none"
     DEEPEP = "deepep"
 
     @classmethod
     def _missing_(cls, value):
         if value is None:
-            return cls.STANDARD
+            return cls.NONE
         for member in cls:
-            if value in member.value:
+            if value == member.value:
                 return member
         raise ValueError(f"No {cls.__name__} member for value {value}")
 
+    def is_none(self):
+        return self == MoeA2ABackend.NONE
+
     def is_deepep(self):
         return self == MoeA2ABackend.DEEPEP
 
-    def is_standard(self):
-        return self == MoeA2ABackend.STANDARD
+
+class MoeRunnerBackend(Enum):
+
+    AUTO = "auto"
+    TRITON = "triton"
+    TRITON_KERNEL = "triton_kernel"
+    FLASHINFER = "flashinfer_trtllm"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_MXFP4 = "flashinfer_mxfp4"
+
+    def is_auto(self):
+        return self == MoeRunnerBackend.AUTO
+
+    def is_triton(self):
+        return self == MoeRunnerBackend.TRITON
+
+    def is_triton_kernel(self):
+        return self == MoeRunnerBackend.TRITON_KERNEL
+
+    def is_flashinfer_trtllm(self):
+        return self == MoeRunnerBackend.FLASHINFER
+
+    def is_flashinfer_cutlass(self):
+        return self == MoeRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_mxfp4(self):
+        return self == MoeRunnerBackend.FLASHINFER_MXFP4
 
 
 class DeepEPMode(Enum):
+
     NORMAL = "normal"
     LOW_LATENCY = "low_latency"
     AUTO = "auto"
 
-    def enable_normal(self):
+    def enable_normal(self) -> bool:
        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
 
-    def enable_low_latency(self):
+    def enable_low_latency(self) -> bool:
         return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
 
-    def resolve(self, is_extend_in_batch: bool):
+    def resolve(self, is_extend_in_batch: bool) -> DeepEPMode:
         if self != DeepEPMode.AUTO:
             return self
 
@@ -57,3 +87,114 @@ class DeepEPMode(Enum):
             return DeepEPMode.NORMAL
         else:
             return DeepEPMode.LOW_LATENCY
+
+    def is_normal(self) -> bool:
+        return self == DeepEPMode.NORMAL
+
+    def is_low_latency(self) -> bool:
+        return self == DeepEPMode.LOW_LATENCY
+
+    def is_auto(self) -> bool:
+        return self == DeepEPMode.AUTO
+
+
+MOE_A2A_BACKEND: Optional[MoeA2ABackend] = None
+MOE_RUNNER_BACKEND: Optional[MoeRunnerBackend] = None
+DEEPEP_MODE: Optional[DeepEPMode] = None
+IS_TBO_ENABLED: Optional[bool] = None
+TBO_TOKEN_DISTRIBUTION_THRESHOLD: Optional[float] = None
+DEEPEP_CONFIG: Optional[str] = None
+DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER: Optional[bool] = None
+
+
+def initialize_moe_config(server_args: ServerArgs):
+    global MOE_A2A_BACKEND
+    global MOE_RUNNER_BACKEND
+    global DEEPEP_MODE
+    global DEEPEP_CONFIG
+    global IS_TBO_ENABLED
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    global DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+
+    MOE_A2A_BACKEND = MoeA2ABackend(server_args.moe_a2a_backend)
+    MOE_RUNNER_BACKEND = MoeRunnerBackend(server_args.moe_runner_backend)
+    DEEPEP_MODE = DeepEPMode(server_args.deepep_mode)
+    DEEPEP_CONFIG = server_args.deepep_config or ""
+    IS_TBO_ENABLED = server_args.enable_two_batch_overlap
+    TBO_TOKEN_DISTRIBUTION_THRESHOLD = server_args.tbo_token_distribution_threshold
+    DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = (
+        server_args.disable_flashinfer_cutlass_moe_fp4_allgather
+    )
+
+
+def get_moe_a2a_backend() -> MoeA2ABackend:
+    global MOE_A2A_BACKEND
+    if MOE_A2A_BACKEND is None:
+        logger.warning("MOE_A2A_BACKEND is not initialized, using default backend")
+        MOE_A2A_BACKEND = MoeA2ABackend(None)
+    return MOE_A2A_BACKEND
+
+
+def get_moe_runner_backend() -> MoeRunnerBackend:
+    global MOE_RUNNER_BACKEND
+    if MOE_RUNNER_BACKEND is None:
+        logger.warning("MOE_RUNNER_BACKEND is not initialized, using triton backend")
+        MOE_RUNNER_BACKEND = MoeRunnerBackend("triton")
+    return MOE_RUNNER_BACKEND
+
+
+def get_deepep_mode() -> DeepEPMode:
+    global DEEPEP_MODE
+    if DEEPEP_MODE is None:
+        logger.warning("DEEPEP_MODE is not initialized, using auto mode")
+        DEEPEP_MODE = DeepEPMode("auto")
+    return DEEPEP_MODE
+
+
+def get_deepep_config() -> str:
+    global DEEPEP_CONFIG
+    if DEEPEP_CONFIG is None:
+        logger.warning("DEEPEP_CONFIG is not initialized, using default config")
+        DEEPEP_CONFIG = ""
+    return DEEPEP_CONFIG
+
+
+def is_tbo_enabled() -> bool:
+    global IS_TBO_ENABLED
+    if IS_TBO_ENABLED is None:
+        logger.warning("IS_TBO_ENABLED is not initialized, using False")
+        IS_TBO_ENABLED = False
+    return IS_TBO_ENABLED
+
+
+def get_tbo_token_distribution_threshold() -> float:
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    if TBO_TOKEN_DISTRIBUTION_THRESHOLD is None:
+        logger.warning(
+            "TBO_TOKEN_DISTRIBUTION_THRESHOLD is not initialized, using 0.48"
+        )
+        TBO_TOKEN_DISTRIBUTION_THRESHOLD = 0.48
+    return TBO_TOKEN_DISTRIBUTION_THRESHOLD
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    result = get_moe_runner_backend().is_flashinfer_trtllm() and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+    return result
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_cutlass_moe_fp4_allgather():
+    """
+    Perform FP4 quantize before all-gather for flashinfer cutlass moe to reduce communication cost for high-throughput serving.
+    """
+    return (
+        not DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+        and get_moe_runner_backend().is_flashinfer_cutlass()
+        and is_dp_attention_enabled()
+        and get_moe_expert_parallel_world_size() == get_attention_dp_size()
+    )
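
These two hunks are sglang/srt/layers/moe/utils.py (entry 82 in the file list): MoE settings stop being read from global_server_args_dict and become a module-level config initialized once from ServerArgs. A minimal usage sketch under that assumption; ServerArgs construction is simplified here, and only names visible in the diff are used:

    from sglang.srt.layers.moe.utils import (
        MoeA2ABackend,
        get_deepep_mode,
        get_moe_a2a_backend,
        initialize_moe_config,
    )
    from sglang.srt.server_args import ServerArgs

    # _missing_ now maps None to the explicit NONE member (formerly STANDARD),
    # and matches values with == instead of `in`:
    assert MoeA2ABackend(None) is MoeA2ABackend.NONE

    server_args = ServerArgs(model_path="dummy/model")  # hypothetical arguments
    initialize_moe_config(server_args)  # normally done once at engine startup

    if get_moe_a2a_backend().is_deepep():
        # AUTO resolves per batch; NORMAL is returned for extend (prefill) batches
        mode = get_deepep_mode().resolve(is_extend_in_batch=True)

Note that every getter also tolerates being called before initialize_moe_config: it logs a warning and falls back to a default, which keeps standalone tooling working.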
--- a/sglang/srt/layers/multimodal.py
+++ b/sglang/srt/layers/multimodal.py
@@ -17,57 +17,173 @@ import torch
 import triton
 import triton.language as tl
 
+FMIX32_C1 = 0x85EBCA6B
+FMIX32_C2 = 0xC2B2AE35
+POS_C1 = 0x27D4EB2D
+POS_C2 = 0x165667B1
+
+
+@triton.jit
+def _rotl32(x, r: tl.constexpr):
+    return (x << r) | (x >> (32 - r))
+
+
+@triton.jit
+def _fmix32(x, C1: tl.constexpr, C2: tl.constexpr):
+    c1 = tl.full((), C1, tl.uint32)
+    c2 = tl.full((), C2, tl.uint32)
+    x ^= x >> 16
+    x = x * c1
+    x ^= x >> 13
+    x = x * c2
+    x ^= x >> 16
+    return x
+
 
 @triton.jit
-def hash_kernel(
-    input_ptr,
-    output_ptr,
-    n_elements,
-    BLOCK_SIZE: tl.constexpr,
-    PRIME: tl.constexpr,
-    XCONST: tl.constexpr,
+def hash_tiles32_kernel_blocked(
+    in_ptr,
+    out_ptr,
+    n_u32,
+    seed1,
+    seed2,
+    FM_C1: tl.constexpr,
+    FM_C2: tl.constexpr,
+    POS_A: tl.constexpr,
+    POS_B: tl.constexpr,
+    TILE: tl.constexpr,
+    BLOCK: tl.constexpr,
+    USE_CG: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
-    block_start = pid * BLOCK_SIZE
-    offsets = block_start + tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_elements
+    base = pid * TILE
+
+    s1 = tl.full((), seed1, tl.uint32)
+    s2 = tl.full((), seed2, tl.uint32)
+    posA = tl.full((), POS_A, tl.uint32)
+    posB = tl.full((), POS_B, tl.uint32)
+
+    h1 = tl.zeros((), dtype=tl.uint32)
+    h2 = tl.zeros((), dtype=tl.uint32)
+
+    for off in tl.static_range(0, TILE, BLOCK):
+        idx = base + off + tl.arange(0, BLOCK)
+        m = idx < n_u32
 
-    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
-    mixed = data ^ (offsets.to(tl.int64) + XCONST)
-    hash_val = mixed * PRIME
-    hash_val = hash_val ^ (hash_val >> 16)
-    hash_val = hash_val * (PRIME ^ XCONST)
-    hash_val = hash_val ^ (hash_val >> 13)
+        if USE_CG:
+            v = tl.load(in_ptr + idx, mask=m, other=0, cache_modifier=".cg")
+        else:
+            v = tl.load(in_ptr + idx, mask=m, other=0)
+        v = v.to(tl.uint32)
+
+        iu = idx.to(tl.uint32)
+        p1 = (iu * posA + s1) ^ _rotl32(iu, 15)
+        p2 = (iu * posB + s2) ^ _rotl32(iu, 13)
+
+        k1 = _fmix32(v ^ p1, C1=FM_C1, C2=FM_C2)
+        k2 = _fmix32(v ^ p2, C1=FM_C1, C2=FM_C2)
+
+        zero32 = tl.zeros_like(k1)
+        k1 = tl.where(m, k1, zero32)
+        k2 = tl.where(m, k2, zero32)
+
+        h1 += tl.sum(k1, axis=0).to(tl.uint32)
+        h2 += tl.sum(k2, axis=0).to(tl.uint32)
+
+    nbytes = tl.full((), n_u32 * 4, tl.uint32)
+    h1 ^= nbytes
+    h2 ^= nbytes
+    h1 = _fmix32(h1, C1=FM_C1, C2=FM_C2)
+    h2 = (
+        _fmix32(h2, C1=FMIX32_C1, C2=FMIX32_C2)
+        if False
+        else _fmix32(h2, C1=FM_C1, C2=FM_C2)
+    )
+
+    out = (h1.to(tl.uint64) << 32) | h2.to(tl.uint64)
+    tl.store(out_ptr + pid, out)
+
+
+@triton.jit
+def add_tree_reduce_u64_kernel(in_ptr, out_ptr, n_elems, CHUNK: tl.constexpr):
+    pid = tl.program_id(axis=0)
+    start = pid * CHUNK
+    h = tl.zeros((), dtype=tl.uint64)
+    for i in tl.static_range(0, CHUNK):
+        idx = start + i
+        m = idx < n_elems
+        v = tl.load(in_ptr + idx, mask=m, other=0).to(tl.uint64)
+        h += v
+    tl.store(out_ptr + pid, h)
 
-    tl.store(output_ptr + offsets, hash_val, mask=mask)
 
+def _as_uint32_words(t: torch.Tensor) -> torch.Tensor:
+    assert t.is_cuda, "Use .cuda() first"
+    tb = t.contiguous().view(torch.uint8)
+    nbytes = tb.numel()
+    pad = (4 - (nbytes & 3)) & 3
+    if pad:
+        tb_p = torch.empty(nbytes + pad, dtype=torch.uint8, device=tb.device)
+        tb_p[:nbytes].copy_(tb)
+        tb_p[nbytes:].zero_()
+        tb = tb_p
+    return tb.view(torch.uint32)
 
-PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
-PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1
 
+def _final_splitmix64(x: int) -> int:
+    mask = (1 << 64) - 1
+    x &= mask
+    x ^= x >> 30
+    x = (x * 0xBF58476D1CE4E5B9) & mask
+    x ^= x >> 27
+    x = (x * 0x94D049BB133111EB) & mask
+    x ^= x >> 31
+    return x
 
-def gpu_tensor_hash(tensor: torch.Tensor) -> int:
-    assert tensor.is_cuda
-    tensor = tensor.contiguous().view(torch.int32)
-    n = tensor.numel()
-    BLOCK_SIZE = 1024
-    grid = (triton.cdiv(n, BLOCK_SIZE),)
 
-    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)
+@torch.inference_mode()
+def gpu_tensor_hash(
+    tensor: torch.Tensor,
+    *,
+    seed: int = 0x243F6A88,
+    tile_words: int = 8192,
+    block_words: int = 256,
+    reduce_chunk: int = 1024,
+    num_warps: int = 4,
+    num_stages: int = 4,
+    use_cg: bool = True,
+) -> int:
+    assert tensor.is_cuda, "Use .cuda() first"
+    u32 = _as_uint32_words(tensor)
+    n = u32.numel()
+    if n == 0:
+        return 0
 
-    # Set cuda device to prevent ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
-    # Solution from Tri: https://github.com/Dao-AILab/flash-attention/issues/523#issuecomment-1707611579
-    with torch.cuda.device(tensor.device):
-        hash_kernel[grid](
-            tensor,
-            intermediate_hashes,
-            n,
-            BLOCK_SIZE=BLOCK_SIZE,
-            PRIME=PRIME_1,
-            XCONST=PRIME_2,
-        )
+    grid1 = (triton.cdiv(n, tile_words),)
+    partials = torch.empty(grid1[0], dtype=torch.uint64, device=u32.device)
+    hash_tiles32_kernel_blocked[grid1](
+        u32,
+        partials,
+        n,
+        seed1=seed & 0xFFFFFFFF,
+        seed2=((seed * 0x9E3779B1) ^ 0xDEADBEEF) & 0xFFFFFFFF,
+        FM_C1=FMIX32_C1,
+        FM_C2=FMIX32_C2,
+        POS_A=POS_C1,
+        POS_B=POS_C2,
+        TILE=tile_words,
+        BLOCK=block_words,
+        USE_CG=use_cg,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
 
-    # TODO: threads can't be synced on triton kernel
-    final_hash = intermediate_hashes.sum().item()
+    cur = partials
+    while cur.numel() > 1:
+        n_elems = cur.numel()
+        grid2 = (triton.cdiv(n_elems, reduce_chunk),)
+        nxt = torch.empty(grid2[0], dtype=torch.uint64, device=cur.device)
+        add_tree_reduce_u64_kernel[grid2](cur, nxt, n_elems, CHUNK=reduce_chunk)
+        cur = nxt
 
-    return final_hash
+    return _final_splitmix64(int(cur.item()))
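
This hunk is sglang/srt/layers/multimodal.py (entry 83): the single-pass 64-bit multiply-xor hash is replaced by a tiled two-stage scheme, a murmur3-style fmix32 over 32-bit words with position-dependent mixing per tile, an on-device tree reduction of the 64-bit partials, and a splitmix64 finalizer on the host. A usage sketch, assuming a CUDA device is available:

    import torch

    from sglang.srt.layers.multimodal import gpu_tensor_hash

    t = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
    h = gpu_tensor_hash(t)

    assert 0 <= h < 2**64                   # splitmix64 output is a 64-bit value
    assert h == gpu_tensor_hash(t.clone())  # deterministic for identical bytes
    # The hash covers raw bytes only (via _as_uint32_words), so a view that
    # reinterprets the same storage hashes identically:
    assert h == gpu_tensor_hash(t.view(torch.uint8))

Unlike the old kernel, which allocated one int64 per element and summed them, the partial buffer here holds one uint64 per tile, so the temporary allocation shrinks by a factor of tile_words (8192 by default).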
--- a/sglang/srt/layers/quantization/__init__.py
+++ b/sglang/srt/layers/quantization/__init__.py
@@ -16,7 +16,6 @@ try:
     )
     from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
     from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
-    from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
     from vllm.model_executor.layers.quantization.gguf import GGUFConfig
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
@@ -37,9 +36,9 @@ except ImportError as e:
 
     AQLMConfig = BitsAndBytesConfig = CompressedTensorsConfig = DeepSpeedFPConfig = (
         ExpertsInt8Config
-    ) = FBGEMMFp8Config = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = (
-        Int8TpuConfig
-    ) = DummyConfig
+    ) = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = Int8TpuConfig = (
+        DummyConfig
+    )
 
 
 from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
@@ -48,20 +47,9 @@ from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
-from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
-
-is_mxfp_supported = mxfp_supported()
-if is_mxfp_supported:
-    from sglang.srt.layers.quantization.fp4 import MxFp4Config
-
 from sglang.srt.layers.quantization.fp8 import Fp8Config
-from sglang.srt.layers.quantization.gptq import (
-    GPTQConfig,
-    GPTQLinearMethod,
-    GPTQMarlinConfig,
-    GPTQMarlinLinearMethod,
-    GPTQMarlinMoEMethod,
-)
+from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
     ModelOptFp8Config,
@@ -70,10 +58,12 @@ from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
 from sglang.srt.layers.quantization.petit import PetitNvFp4Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
-from sglang.srt.layers.quantization.utils import get_linear_quant_method
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
+from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
+
+_is_mxfp_supported = mxfp_supported()
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
@@ -86,11 +76,16 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "modelopt_fp4": ModelOptFp4Config,
     "w8a8_int8": W8A8Int8Config,
     "w8a8_fp8": W8A8Fp8Config,
+    "awq": AWQConfig,
+    "awq_marlin": AWQMarlinConfig,
+    "gptq": GPTQConfig,
+    "gptq_marlin": GPTQMarlinConfig,
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
     "qoq": QoQConfig,
     "w4afp8": W4AFp8Config,
     "petit_nvfp4": PetitNvFp4Config,
+    "fbgemm_fp8": FBGEMMFp8Config,
 }
 
 
@@ -101,29 +96,26 @@ if is_cuda():
             "mxfp4": Mxfp4Config,
         }
     )
-elif is_mxfp_supported and is_hip():
+elif _is_mxfp_supported and is_hip():
+    from sglang.srt.layers.quantization.quark.quark import QuarkConfig
+
     BASE_QUANTIZATION_METHODS.update(
         {
-            "quark": MxFp4Config,
-            "mxfp4": MxFp4Config,
+            "quark": QuarkConfig,
+            "mxfp4": Mxfp4Config,
         }
     )
 # VLLM-dependent quantization methods
 VLLM_QUANTIZATION_METHODS = {
     "aqlm": AQLMConfig,
-    "awq": AWQConfig,
     "deepspeedfp": DeepSpeedFPConfig,
     "tpu_int8": Int8TpuConfig,
-    "fbgemm_fp8": FBGEMMFp8Config,
     "marlin": MarlinConfig,
     "gguf": GGUFConfig,
     "gptq_marlin_24": GPTQMarlin24Config,
-    "awq_marlin": AWQMarlinConfig,
     "bitsandbytes": BitsAndBytesConfig,
     "qqq": QQQConfig,
     "experts_int8": ExpertsInt8Config,
-    "gptq_marlin": GPTQMarlinConfig,
-    "gptq": GPTQConfig,
 }
 
 QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}
@@ -145,23 +137,6 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     return QUANTIZATION_METHODS[quantization]
 
 
-def gptq_get_quant_method(self, layer, prefix):
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-
-    if isinstance(layer, FusedMoE):
-        return GPTQMarlinMoEMethod(self)
-
-    if isinstance(self, GPTQConfig):
-        return get_linear_quant_method(
-            self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod
-        )
-    elif isinstance(self, GPTQMarlinConfig):
-        return get_linear_quant_method(
-            self, layer, prefix=prefix, linear_method_cls=GPTQMarlinLinearMethod
-        )
-    return None
-
-
 original_isinstance = builtins.isinstance
 
 
@@ -239,10 +214,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
 
 def monkey_patch_quant_configs():
     """Apply all monkey patches in one place."""
-    setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
-    setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)
 
-    monkey_patch_moe_apply(GPTQMarlinMoEMethod)
     monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
     monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)
 
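The preceding hunks are sglang/srt/layers/quantization/__init__.py (entry 84). AWQ, GPTQ, and FBGEMM FP8 move from the vllm-backed registry into BASE_QUANTIZATION_METHODS, now served by native sglang/sgl_kernel code per the awq.py and fpgemm_fp8.py changes, which is also why the GPTQ get_quant_method monkey patches disappear. A lookup sketch under that assumption:

    from sglang.srt.layers.quantization import (
        BASE_QUANTIZATION_METHODS,
        VLLM_QUANTIZATION_METHODS,
        get_quantization_config,
    )

    cfg_cls = get_quantization_config("awq_marlin")  # resolves without vllm now
    assert "gptq" in BASE_QUANTIZATION_METHODS       # moved out of the vllm set
    assert "gptq" not in VLLM_QUANTIZATION_METHODS
    assert "aqlm" in VLLM_QUANTIZATION_METHODS       # still vllm-backed; falls
                                                     # back to DummyConfig otherwise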
--- a/sglang/srt/layers/quantization/awq.py
+++ b/sglang/srt/layers/quantization/awq.py
@@ -29,29 +29,26 @@ from sglang.srt.layers.quantization.marlin_utils import (
     verify_marlin_supported,
     verify_marlin_supports_shape,
 )
-from sglang.srt.layers.quantization.scalar_type import scalar_types
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.layers.quantization.utils import replace_parameter
+from sglang.srt.layers.quantization.utils import get_scalar_types, replace_parameter
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.topk import TopKOutput
-
-try:
-    from vllm import _custom_ops as ops
-
-    warnings.warn(
-        f"Using kernels directly from vllm. This might lead to performance degradation or "
-        f"missing functionalities as certain kernels may not be optimized. "
-    )
-except ImportError:
-    ops = None
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.topk import StandardTopKOutput
 
 from sglang.srt.utils import is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 if _is_cuda:
-    from sgl_kernel import awq_dequantize, fused_marlin_moe
+    from sgl_kernel import (
+        awq_dequantize,
+        awq_marlin_moe_repack,
+        awq_marlin_repack,
+        fused_marlin_moe,
+    )
+
+
 elif _is_hip:
     from sglang.srt.layers.quantization.awq_triton import (
         awq_dequantize_triton as awq_dequantize,
@@ -64,6 +61,9 @@ else:
 logger = logging.getLogger(__name__)
 
 
+ScalarType, scalar_types = get_scalar_types()
+
+
 def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]):
     return any(module_name in prefix for module_name in modules_to_not_convert)
 
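get_scalar_types() replaces the vendored scalar_type module (deleted in this release, entry 200) with a lookup provided by quantization/utils.py (entry 103), whose body is outside this diff. A plausible sketch of the pattern; the vllm import path and the mock fallback are assumptions, not the actual implementation:

    def get_scalar_types():
        # Hypothetical: prefer a real ScalarType implementation when one is
        # installed, otherwise return stand-ins so importing awq.py never fails.
        try:
            from vllm.scalar_type import ScalarType, scalar_types

            return ScalarType, scalar_types
        except ImportError:

            class MockScalarType:
                pass

            class MockScalarTypes:
                def __getattr__(self, name):
                    return None  # e.g. scalar_types.uint4b8 -> None

            return MockScalarType, MockScalarTypes()

Binding the result once at module scope (ScalarType, scalar_types = get_scalar_types()) keeps the rest of the file unchanged relative to the old vendored import.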
@@ -516,7 +516,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         layer.workspace = marlin_make_workspace(device)
 
         # Repack weights from AWQ format to marlin format.
-        marlin_qweight = ops.awq_marlin_repack(
+        marlin_qweight = awq_marlin_repack(
             layer.qweight,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
@@ -684,7 +684,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             requires_grad=False,
         )
 
-        marlin_w13_qweight = ops.awq_marlin_moe_repack(
+        marlin_w13_qweight = awq_marlin_moe_repack(
             layer.w13_qweight,
             layer.w13_g_idx_sort_indices,
             size_k=layer.w13_qweight.shape[1],
@@ -693,7 +693,7 @@
         )
         replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
 
-        marlin_w2_qweight = ops.awq_marlin_moe_repack(
+        marlin_w2_qweight = awq_marlin_moe_repack(
             layer.w2_qweight,
             layer.w2_g_idx_sort_indices,
             size_k=layer.w2_qweight.shape[1],
@@ -740,13 +740,12 @@
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        topk_output: TopKOutput,
-        *,
-        activation: str = "silu",
-        **kwargs,
+        topk_output: StandardTopKOutput,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
-
-        assert activation == "silu", "Only SiLU activation is supported."
+        assert (
+            moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
 
         # The input must currently be float16
         orig_dtype = x.dtype
--- a/sglang/srt/layers/quantization/base_config.py
+++ b/sglang/srt/layers/quantization/base_config.py
@@ -9,6 +9,7 @@ import torch
 from torch import nn
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
     from sglang.srt.layers.moe.topk import TopKOutput
 
 
@@ -100,12 +101,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         layer: torch.nn.Module,
         x: torch.Tensor,
         topk_output: TopKOutput,
-        *,
-        activation: str = "silu",
-        apply_router_weight_on_input: bool = False,
-        inplace: bool = True,
-        no_combine: bool = False,
-        routed_scaling_factor: Optional[float] = None,
+        moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
         raise NotImplementedError
 
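
Entry 86, sglang/srt/layers/quantization/base_config.py: FusedMoEMethodBase.apply folds the loose keyword arguments (activation, apply_router_weight_on_input, inplace, no_combine, routed_scaling_factor) into a single MoeRunnerConfig, matching the AWQMoEMethod hunk above. A sketch of a conforming subclass; of the config's fields, only activation is confirmed by this diff:

    import torch

    from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase

    class SketchMoEMethod(FusedMoEMethodBase):
        def apply(self, layer, x, topk_output, moe_runner_config) -> torch.Tensor:
            # Per-call knobs now travel on the config object rather than **kwargs.
            assert moe_runner_config.activation == "silu"
            # A real implementation dispatches to a fused MoE kernel using the
            # layer weights and topk_output; identity keeps the sketch runnable.
            return x

Centralizing these knobs means a new runner option no longer requires touching every quantization method's apply signature.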