sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py

@@ -51,6 +51,10 @@ ACTIVATION_SCHEMES = ["static", "dynamic"]
 
 is_hip_ = is_hip()
 
+if is_hip_:
+    from aiter.fused_moe_bf16_asm import asm_moe
+    from aiter.ops.shuffle import shuffle_weight
+
 logger = logging.getLogger(__name__)
 
 
@@ -533,6 +537,20 @@ class Fp8MoEMethod:
         )
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        if is_hip_ and get_bool_env_var("CK_MOE"):
+            # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
+            w13_weight_scale1 = torch.nn.Parameter(
+                torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            w2_weight_scale1 = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale1", w13_weight_scale1)
+            layer.register_parameter("w2_weight_scale1", w2_weight_scale1)
+
         # Add the quantization method used (per tensor/grouped/channel)
         # to ensure the weight scales are loaded in properly
         extra_weight_attrs.update(
@@ -602,6 +620,15 @@ class Fp8MoEMethod:
                 w2_weight_scale, requires_grad=False
            )
            layer.w2_input_scale = None
+
+            if get_bool_env_var("CK_MOE"):
+                # Pre-shuffle weights
+                layer.w13_weight.data = shuffle_weight(
+                    layer.w13_weight.contiguous(), (16, 16)
+                )
+                layer.w2_weight.data = shuffle_weight(
+                    layer.w2_weight.contiguous(), (16, 16)
+                )
            return
        # If checkpoint is fp16 or bfloat16, quantize in place.
        if not self.quant_config.is_checkpoint_fp8_serialized:
@@ -640,6 +667,9 @@ class Fp8MoEMethod:
                    requires_grad=False,
                )
                torch.cuda.empty_cache()
+                # ROCm (CK_MOE): using column-wise scaling
+                layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
+                layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
            elif get_bool_env_var("MOE_PADDING"):
                # If ROCm, apply weight padding (min. Mem channel contention) only if set
                layer.w13_weight = torch.nn.Parameter(
@@ -744,6 +774,9 @@ class Fp8MoEMethod:
                    requires_grad=False,
                )
                torch.cuda.empty_cache()
+                # ROCm (CK_MOE): using column-wise scaling
+                layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
+                layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
            elif get_bool_env_var("MOE_PADDING"):
                # If ROCm, apply weight padding (min. Mem channel contention) only if set
                layer.w13_weight = torch.nn.Parameter(
@@ -771,6 +804,8 @@ class Fp8MoEMethod:
        custom_routing_function: Optional[Callable] = None,
        correction_bias: Optional[torch.Tensor] = None,
        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
        from sglang.srt.layers.moe.topk import select_experts
@@ -788,33 +823,38 @@ class Fp8MoEMethod:
            correction_bias=correction_bias,
        )
 
-        if is_hip_ and get_bool_env_var("CK_MOE"):
-        [removed lines 792-817 appear without content in the source diff view]
+        if is_hip_ and get_bool_env_var("CK_MOE") and activation == "silu":
+            # TODO(CK_MOE): FP8 or FP8 block_quant only supports 'silu' for the time-being.
+            assert not no_combine, f"{no_combine=} is not supported."
+            if self.block_quant:
+                return asm_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    layer.w13_weight_scale_inv,
+                    layer.w2_weight_scale_inv,
+                    None,
+                    None,
+                    False,
+                    None,
+                    block_shape=tuple(self.quant_config.weight_block_size),
+                    expert_mask=None,
+                )
+            else:
+                return asm_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    layer.w13_weight_scale1,
+                    layer.w2_weight_scale1,
+                    None,
+                    None,
+                    False,
+                )
        else:
            # Expert fusion with FP8 quantization
            return fused_experts(
@@ -823,7 +863,7 @@ class Fp8MoEMethod:
                layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
-                inplace=
+                inplace=inplace and not no_combine,
                activation=activation,
                use_fp8_w8a8=True,
                w1_scale=(
@@ -839,6 +879,7 @@ class Fp8MoEMethod:
                a1_scale=layer.w13_input_scale,
                a2_scale=layer.w2_input_scale,
                block_shape=self.quant_config.weight_block_size,
+                no_combine=no_combine,
            )
 
 
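The ROCm-specific branches above (the aiter imports, the new *_weight_scale1 parameters, the shuffle_weight pre-shuffling, and the asm_moe dispatch in apply()) are all gated on the CK_MOE environment variable, read through get_bool_env_var. A minimal sketch of opting in on a ROCm machine, assuming only what the diff shows (the flag name and the helper in sglang.srt.utils); the exact truthy values accepted by the helper are an assumption:

import os

# Illustrative opt-in: the flag must be set before the fp8 modules are imported,
# since the aiter imports above run at module import time.
os.environ["CK_MOE"] = "1"

from sglang.srt.utils import get_bool_env_var  # same helper used by the diff above

# Assuming the helper treats "1" as true, the quantization code now takes the
# aiter-based path on ROCm instead of fused_experts.
assert get_bool_env_var("CK_MOE")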
sglang/srt/layers/quantization/fp8_utils.py

@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional, Tuple
 
 import torch
@@ -7,9 +8,12 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
 )
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import get_bool_env_var, is_hip
 
 is_hip_ = is_hip()
+if is_hip_ and get_bool_env_var("CK_MOE"):
+    from aiter import gemm_a8w8_blockscale
+
 _is_cuda = torch.cuda.is_available() and torch.version.cuda
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm
@@ -40,6 +44,8 @@ def normalize_e4m3fn_to_e4m3fnuz(
 
 
 def cutlass_block_fp8_supported() -> bool:
+    if os.environ.get("SUPPORT_CUTLASS_BLOCK_FP8") is None:
+        return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
         sm_version = major * 10 + minor
@@ -75,6 +81,16 @@ def apply_w8a8_block_fp8_linear(
        output = fp8_blockwise_scaled_mm(
            q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
        )
+    elif is_hip_ and get_bool_env_var("CK_MOE"):
+        q_input, x_scale = per_token_group_quant_fp8(
+            input_2d, block_size[1], column_major_scales=False
+        )
+        output = torch.zeros(
+            [q_input.shape[0], weight.shape[0]],
+            dtype=input.dtype,
+            device=q_input.device,
+        )
+        gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale, output)
    else:
        q_input, x_scale = per_token_group_quant_fp8(
            input_2d, block_size[1], column_major_scales=False
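One behavioral change worth noting: cutlass_block_fp8_supported() now returns False whenever the SUPPORT_CUTLASS_BLOCK_FP8 environment variable is unset, before the existing CUDA capability check runs. A minimal sketch of re-enabling the cutlass path; the import path is taken from the file above, everything else is illustrative:

import os

# Any value passes the new guard (it only checks whether the variable is set);
# the pre-existing capability / sm_version check in the function still applies.
os.environ["SUPPORT_CUTLASS_BLOCK_FP8"] = "1"

from sglang.srt.layers.quantization.fp8_utils import cutlass_block_fp8_supported

print(cutlass_block_fp8_supported())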
sglang/srt/layers/quantization/gptq.py

@@ -0,0 +1,416 @@
+import logging
+from fractions import Fraction
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+from vllm.scalar_type import scalar_types
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+logger = logging.getLogger(__name__)
+
+
+class GPTQConfig(QuantizationConfig):
+    """Config class for GPTQ.
+
+    Reference: https://arxiv.org/abs/2210.17323
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+    ) -> None:
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        # # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #     #`.*\.` matches the layers_node prefix
+        #     # positive match layer 10-15
+        #     r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #     # positive match layer 16-21
+        #     r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #     r"-:.*\.moe\..*": {},  # negative match (skip) all `moe` layers
+        # }
+        super().__init__()
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.pack_factor = Fraction(32, self.weight_bits)
+        if self.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {self.weight_bits} bits."
+            )
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}),"
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic}"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 60
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig":
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(weight_bits, group_size, desc_act, lm_head_quantized, dynamic)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["GPTQLinearMethod"]:
+        from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+
+        from sglang.srt.layers.quantization import get_linear_quant_method
+
+        return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
+
+
+class GPTQMarlinConfig(QuantizationConfig):
+    """Config class for GPTQ Marlin"""
+
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        (4, True): scalar_types.uint4b8,
+        (8, True): scalar_types.uint8b128,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        if desc_act and group_size == -1:
+            # In this case, act_order == True is the same as act_order == False
+            # (since we have only one group per output channel)
+            desc_act = False
+
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        # # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #     #`.*\.` matches the layers_node prefix
+        #     # positive match layer 10-15
+        #     r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #     # positive match layer 16-21
+        #     r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #     r"-:.*\.moe\..*": {},  # negative match (skip) all `moe` layers
+        # }
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.is_sym = is_sym
+
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.full_config = full_config
+
+        if (weight_bits, is_sym) not in self.TYPE_MAP:
+            raise ValueError(
+                "Unsupported quantization config: " f"bits={weight_bits}, sym={is_sym}"
+            )
+
+        self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}, "
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic}"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq_marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig":
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        is_sym = cls.get_from_keys(config, ["sym"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            is_sym,
+            lm_head_quantized,
+            dynamic,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin"
+        )
+
+        if can_convert and is_valid_user_quant:
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        if can_convert and user_quant == "gptq":
+            logger.info(
+                "Detected that the model can run with gptq_marlin"
+                ", however you specified quantization=gptq explicitly,"
+                " so forcing gptq. Use quantization=gptq_marlin for"
+                " faster inference"
+            )
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        from vllm.model_executor.layers.quantization.gptq_marlin import (
+            GPTQMarlinLinearMethod,
+            GPTQMarlinMoEMethod,
+        )
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.quantization import get_linear_quant_method
+
+        if isinstance(layer, FusedMoE):
+            return GPTQMarlinMoEMethod(self)
+            # TODO: re-enable after SGLang syncs with vllm >= 0.7.3
+            # if layer.num_experts > 32:
+            #     # For MoEs with many experts the moe_wna16 kernel is faster
+            #     return MoeWNA16Config.from_config(self.full_config).get_quant_method(
+            #         layer, prefix
+            #     )
+            # else:
+            #     return GPTQMarlinMoEMethod(self)
+        return get_linear_quant_method(self, layer, prefix, GPTQMarlinLinearMethod)
+
+    @classmethod
+    def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        sym = quant_config.get("sym")
+        desc_act = quant_config.get("desc_act")
+
+        from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+            check_marlin_supported,
+        )
+        from vllm.platforms import current_platform
+
+        if not current_platform.is_cuda():
+            return False
+
+        if quant_method != "gptq":
+            return False
+
+        # Marlin conversion is only valid if required properties are found
+        if num_bits is None or group_size is None or sym is None or desc_act is None:
+            return False
+
+        if (num_bits, sym) not in cls.TYPE_MAP:
+            return False
+
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
+        )
+
+
+class MarlinConfig(QuantizationConfig):
+    """Config class for Marlin.
+
+    Reference: https://github.com/IST-DASLab/marlin/tree/master
+    """
+
+    def __init__(
+        self,
+        group_size: int,
+        lm_head_quantized: bool,
+    ) -> None:
+        # Group size for the quantization.
+        self.group_size = group_size
+        self.lm_head_quantized = lm_head_quantized
+        if self.group_size != 128 and self.group_size != -1:
+            raise ValueError(
+                "Currently, only group size 128 and -1 (channelwise) "
+                "is supported for Marlin, but got group_size of "
+                f"{self.group_size}"
+            )
+
+        # 4 Bits packed into 32 bit datatype.
+        self.pack_factor = 32 // 4
+
+        # Tile size used by marlin kernels.
+        self.tile_size = 16
+
+        # Min out_features dim
+        self.min_n_threads = 64
+
+        # Min in_features dim
+        self.min_k_threads = 128
+
+        # Max parallel problems to solve at once (improves large
+        # batch performance)
+        self.max_parallel = 16
+
+        # Permutation length used by the marlin kernels.
+        self.perm_len = 1024
+
+    def __repr__(self) -> str:
+        return (
+            f"MarlinConfig(group_size={self.group_size}, "
+            f"lm_head_quantized={self.lm_head_quantized})"
+        )
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig":
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(group_size, lm_head_quantized)
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        # compat: autogptq >=0.8.0 use checkpoint_format: str
+        # compat: autogptq <=0.7.1 is_marlin_format: bool
+        is_marlin_format = hf_quant_cfg.get(
+            "checkpoint_format"
+        ) == "marlin" or hf_quant_cfg.get("is_marlin_format", False)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "marlin"
+        )
+
+        if is_marlin_format and is_valid_user_quant:
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["MarlinLinearMethod"]:
+        from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
+
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            return MarlinLinearMethod(self)
+        return None