sglang 0.4.3.post3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +2 -2
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +95 -49
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +72 -8
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +33 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +212 -117
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +258 -782
- sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
- sglang/srt/managers/tokenizer_manager.py +7 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +63 -34
- sglang/srt/mem_cache/memory_pool.py +112 -46
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/metrics/collector.py +8 -0
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +12 -8
- sglang/srt/model_executor/model_runner.py +153 -134
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +25 -19
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +37 -15
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +19 -20
- sglang/srt/speculative/build_eagle_tree.py +6 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -11
- sglang/srt/speculative/eagle_utils.py +2 -1
- sglang/srt/speculative/eagle_worker.py +109 -38
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -9
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/RECORD +128 -83
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
CHANGED
@@ -11,20 +11,23 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from vllm import _custom_ops as
+from vllm import _custom_ops as vllm_ops

 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.layers.quantization.int8_kernel import
+from sglang.srt.layers.quantization.int8_kernel import (
+    per_token_group_quant_int8,
+    per_token_quant_int8,
+)
 from sglang.srt.utils import (
     direct_register_custom_op,
     get_bool_env_var,
     get_device_name,
-
+    is_cuda,
     is_hip,
 )

-
+_is_hip = is_hip()


 logger = logging.getLogger(__name__)
@@ -34,17 +37,17 @@ enable_moe_align_block_size_triton = bool(
     int(os.getenv("ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
 )

-_is_cuda =
-_is_rocm = torch.cuda.is_available() and torch.version.hip
+_is_cuda = is_cuda()

 if _is_cuda:
     from sgl_kernel import gelu_and_mul, silu_and_mul

+    from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
     from sglang.srt.layers.quantization.fp8_kernel import (
         sglang_per_token_group_quant_fp8,
     )

-if _is_cuda or
+if _is_cuda or _is_hip:
     from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size


@@ -117,6 +120,7 @@ def fused_moe_kernel(
     - expert_ids: A tensor containing the indices of the expert for each
     block. It determines which expert matrix from B should be used for
     each block in A.
+
     This kernel performs the multiplication of a token by its corresponding
     expert matrix as determined by `expert_ids`. The sorting of
     `sorted_token_ids` by expert index and padding ensures divisibility by
@@ -167,17 +171,38 @@ def fused_moe_kernel(
         )
         b_scale = tl.load(b_scale_ptrs)

-    if use_fp8_w8a8
+    if use_fp8_w8a8:
+        # block-wise
         if group_k > 0 and group_n > 0:
             a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
             offs_bsn = offs_bn // group_n
             b_scale_ptrs = (
                 b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
             )
+        # tensor-wise
         else:
             a_scale = tl.load(a_scale_ptr)
             b_scale = tl.load(b_scale_ptr + off_experts)

+    if use_int8_w8a8:
+        # block-wise
+        if group_k > 0 and group_n > 0:
+            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            offs_bsn = offs_bn // group_n
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
+            )
+        # channel-wise
+        else:
+            # Load per-column scale for weights
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
+            )
+            b_scale = tl.load(b_scale_ptrs)
+            # Load per-token scale for activations
+            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None]
+
     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix.
     # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
@@ -217,7 +242,11 @@ def fused_moe_kernel(

             accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
         else:
-
+            # fix out of shared memory issue
+            if use_fp8_w8a8:
+                accumulator = tl.dot(a, b, acc=accumulator)
+            else:
+                accumulator += tl.dot(a, b)
     else:
         accumulator += tl.dot(a, b)
     # Advance the ptrs to the next K block.
@@ -458,7 +487,7 @@ def moe_align_block_size(
             cumsum_buffer,
         )
     else:
-
+        vllm_ops.moe_align_block_size(
             topk_ids,
             num_experts,
             block_size,
@@ -497,9 +526,14 @@ def invoke_fused_moe_kernel(
     if use_fp8_w8a8:
         assert B_scale is not None
         if block_shape is None:
+            # activation tensor-wise fp8 quantization, dynamic or static
             padded_size = padding_size
-
+            if _is_cuda:
+                A, A_scale = sgl_scaled_fp8_quant(A, A_scale)
+            else:
+                A, A_scale = vllm_ops.scaled_fp8_quant(A, A_scale)
         else:
+            # activation block-wise fp8 quantization
             assert len(block_shape) == 2
             block_n, block_k = block_shape[0], block_shape[1]
             if _is_cuda:
@@ -512,9 +546,10 @@ def invoke_fused_moe_kernel(
     elif use_int8_w8a8:
         assert B_scale is not None
         if block_shape is None:
-
-            A, A_scale =
+            # activation channel-wise int8 quantization
+            A, A_scale = per_token_quant_int8(A)
         else:
+            # activation block-wise int8 quantization
             assert len(block_shape) == 2
             block_n, block_k = block_shape[0], block_shape[1]
             A, A_scale = per_token_group_quant_int8(A, block_k)
@@ -648,7 +683,7 @@ def get_default_config(
             "BLOCK_SIZE_K": 128,
             "GROUP_SIZE_M": 32,
             "num_warps": 8,
-            "num_stages": 2 if
+            "num_stages": 2 if _is_hip else 4,
         }
         if M <= E:
             config = {
@@ -657,7 +692,7 @@ def get_default_config(
                 "BLOCK_SIZE_K": 128,
                 "GROUP_SIZE_M": 1,
                 "num_warps": 4,
-                "num_stages": 2 if
+                "num_stages": 2 if _is_hip else 4,
             }
     else:
         # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
@@ -667,7 +702,7 @@ def get_default_config(
                 "BLOCK_SIZE_K": block_shape[1],
                 "GROUP_SIZE_M": 32,
                 "num_warps": 4,
-                "num_stages": 2 if
+                "num_stages": 2 if _is_hip else 3,
             }
         else:
             config = {
@@ -945,7 +980,7 @@ def fused_experts_impl(
     if (
         not (use_fp8_w8a8 or use_int8_w8a8)
         or block_shape is not None
-        or (
+        or (_is_hip and get_bool_env_var("CK_MOE"))
     ):
         padded_size = 0

@@ -1029,7 +1064,9 @@ def fused_experts_impl(
             # so the cache size and config are already set correctly and
             # do not need to be adjusted.
             intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
-            intermediate_cache2 = intermediate_cache2[
+            intermediate_cache2 = intermediate_cache2[
+                : tokens_in_chunk * topk_ids.shape[1]
+            ]
             intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
             config = get_config_func(tokens_in_chunk)

@@ -1060,17 +1097,20 @@ def fused_experts_impl(
             use_int8_w8a16=use_int8_w8a16,
             block_shape=block_shape,
         )
-
         if activation == "silu":
             if _is_cuda:
                 silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
-
+                vllm_ops.silu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
         elif activation == "gelu":
             if _is_cuda:
                 gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
-
+                vllm_ops.gelu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
         else:
             raise ValueError(f"Unsupported activation: {activation=}")

@@ -1101,8 +1141,8 @@ def fused_experts_impl(

     if no_combine:
         pass
-    elif
-
+    elif _is_hip:
+        vllm_ops.moe_sum(
             intermediate_cache3.view(*intermediate_cache3.shape),
             out_hidden_states[begin_chunk_idx:end_chunk_idx],
         )
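The new int8 W8A8 path in invoke_fused_moe_kernel quantizes activations per token when block_shape is None (the channel-wise case) and per group of block_k elements otherwise (the block-wise case). The reference sketch below illustrates what those two helpers compute in plain PyTorch; it is an illustration of symmetric int8 quantization, not the Triton kernels that per_token_quant_int8 and per_token_group_quant_int8 actually dispatch to.

import torch

def per_token_quant_int8_ref(x: torch.Tensor):
    # One scale per row (token): the row's max magnitude maps to 127.
    absmax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
    scale = absmax / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale  # q: [tokens, hidden] int8, scale: [tokens, 1] fp32

def per_token_group_quant_int8_ref(x: torch.Tensor, group_size: int):
    # One scale per contiguous group of `group_size` elements along the last
    # dim (the block-wise path; group_size corresponds to block_k).
    tokens, hidden = x.shape
    xg = x.view(tokens, hidden // group_size, group_size)
    absmax = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
    scale = absmax / 127.0
    q = torch.clamp(torch.round(xg / scale), -128, 127).to(torch.int8)
    return q.view(tokens, hidden), scale.squeeze(-1)  # scale: [tokens, hidden // group_size]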
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -27,9 +27,9 @@ else:

 import logging

-
+_is_hip = is_hip()

-if
+if _is_hip:
     from aiter import ck_moe

 logger = logging.getLogger(__name__)
@@ -102,7 +102,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _is_hip and get_bool_env_var("CK_MOE"):
             layer.w13_weight = torch.nn.Parameter(
                 permute_weight(layer.w13_weight.data),
                 requires_grad=False,
@@ -175,7 +175,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             correction_bias=correction_bias,
         )

-        if
+        if _is_hip and get_bool_env_var("CK_MOE"):
             assert not no_combine, "unsupported"
             return ck_moe(
                 x,
@@ -513,6 +513,10 @@ class FusedMoE(torch.nn.Module):

         # Case input scale: input_scale loading is only supported for fp8
         if "input_scale" in weight_name:
+            # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust input_scale for e4m3fnuz (AMD)
+            if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+                loaded_weight = loaded_weight * 2.0
+
             # this is needed for compressed-tensors only
             loaded_weight = loaded_weight.to(param.data.device)

@@ -551,6 +555,10 @@ class FusedMoE(torch.nn.Module):
             # specific to each case
             quant_method = getattr(param, "quant_method", None)
             if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+                # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust INT4 column-wise scaling number to e4m3fnuz (AMD)
+                if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+                    loaded_weight = loaded_weight * 0.5
+
                 self._load_per_channel_weight_scale(
                     shard_id=shard_id,
                     shard_dim=shard_dim,
@@ -570,6 +578,10 @@ class FusedMoE(torch.nn.Module):
                     tp_rank=tp_rank,
                 )
             elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+                # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust FP8 per-tensor scaling number for e4m3fnuz (AMD)
+                if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+                    loaded_weight = loaded_weight * 2.0
+
                 self._load_per_tensor_weight_scale(
                     shard_id=shard_id,
                     param=param,
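Both files above follow the same hardware dispatch: sgl_kernel ops when running on CUDA, aiter's ck_moe on ROCm when the CK_MOE environment variable is enabled, and vllm's custom ops as the fallback; the INT4-FP8 scale adjustments are likewise gated on the USE_INT4_WEIGHT environment variable on HIP. A minimal sketch of that selection logic, assuming the "1"/"true" convention for truthy environment values:

import os
import torch

def _env_flag(name: str) -> bool:
    # Assumed convention: "1" or "true" enables the flag.
    return os.getenv(name, "0").lower() in ("1", "true")

def select_moe_backend() -> str:
    is_cuda = torch.cuda.is_available() and torch.version.cuda is not None
    is_hip = torch.cuda.is_available() and torch.version.hip is not None
    if is_hip and _env_flag("CK_MOE"):
        return "aiter_ck_moe"   # ROCm composable-kernel MoE path
    if is_cuda:
        return "sgl_kernel"     # native sgl_kernel ops (silu_and_mul, moe_align_block_size, ...)
    return "vllm_ops"           # fallback to vllm's _custom_ops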
sglang/srt/layers/parameter.py
CHANGED
@@ -16,6 +16,7 @@ __all__ = [
     "ModelWeightParameter",
     "ChannelQuantScaleParameter",
     "GroupQuantScaleParameter",
+    "BlockQuantScaleParameter",
     "PackedColumnParameter",
     "RowvLLMParameter",
 ]
@@ -221,6 +222,15 @@ class ChannelQuantScaleParameter(_ColumnvLLMParameter):
     pass


+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    block-wise quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
 class PerTensorScaleParameter(BasevLLMParameter):
     """
     Parameter class for scales where the number of scales is
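BlockQuantScaleParameter carries one scale per (block_n, block_k) tile of a block-quantized weight, so the scale tensor is sharded along both the output and input dimensions, which is why the class mixes the column- and row-parallel behaviors. A small sketch of the shapes involved for the [128, 128] block shape used by the configs above; the helper below is illustrative, not part of sglang:

import torch

def block_scale_shape(n: int, k: int, block_n: int = 128, block_k: int = 128):
    # One scale per (block_n, block_k) tile of an [n, k] weight matrix.
    return ((n + block_n - 1) // block_n, (k + block_k - 1) // block_k)

# Example: a 1536 x 7168 weight quantized to fp8 with 128 x 128 blocks
weight = torch.empty(1536, 7168, dtype=torch.float8_e4m3fn)   # requires torch >= 2.1
scale = torch.empty(*block_scale_shape(1536, 7168))           # shape (12, 56), fp32 scales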
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -1,4 +1,6 @@
 # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
+import builtins
+import inspect
 import re
 from copy import deepcopy
 from typing import Callable, Dict, Optional, Type, Union
@@ -6,10 +8,7 @@ from typing import Callable, Dict, Optional, Type, Union
 import torch
 from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.layers.quantization.awq_marlin import
-    AWQMarlinConfig,
-    AWQMoEMethod,
-)
+from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
@@ -28,6 +27,7 @@ from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config
+from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config

 QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
@@ -50,6 +50,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "qqq": QQQConfig,
     "experts_int8": ExpertsInt8Config,
     "w8a8_int8": W8A8Int8Config,
+    "w8a8_fp8": W8A8Fp8Config,
 }


@@ -178,96 +179,117 @@ def gptq_get_quant_method(self, layer, prefix):
     return None


-
-from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
-from vllm.model_executor.layers.quantization.awq_marlin import (
-    AWQMarlinLinearMethod,
-    AWQMoEMethod,
-)
+original_isinstance = builtins.isinstance

-from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
-from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead

-
-
-
-
-
-        return AWQMarlinLinearMethod(self)
-    elif isinstance(layer, FusedMoE):
-        return AWQMoEMethod(self)
-    return None
+def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
+    """
+    Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
+    can recognize sglang layers
+    """

+    if reverse:
+        builtins.isinstance = original_isinstance
+        return

-
-
-
-def awq_moe_method_apply(
-    self,
-    layer: torch.nn.Module,
-    x: torch.Tensor,
-    router_logits: torch.Tensor,
-    top_k: int,
-    renormalize: bool,
-    use_grouped_topk: bool = False,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
-    scoring_func: str = "softmax",
-    e_score_correction_bias: Optional[torch.Tensor] = None,
-    **kwargs,
-):
-    return original_awq_moe_method_apply(
-        self,
-        layer,
-        x,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        custom_routing_function,
-        scoring_func,
-        e_score_correction_bias,
-    )
-
-
-def patch_vllm_linear_base_isinstance():
-    import builtins
-
+    from vllm.model_executor.layers.fused_moe import FusedMoE
     from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding,
+    )

     from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
-
-
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
+    from sglang.srt.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
+    )

     def patched_isinstance(obj, classinfo):
         if classinfo is LinearBase:
             return original_isinstance(obj, PatchedLinearBase)
+        if classinfo is FusedMoE:
+            return original_isinstance(obj, PatchedFusedMoE)
+        if classinfo is VocabParallelEmbedding:
+            return original_isinstance(obj, PatchedVocabParallelEmbedding)
         return original_isinstance(obj, classinfo)

     builtins.isinstance = patched_isinstance


-def
+def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
+    """
+    Monkey patch the apply function of vllm's FusedMoEMethodBase.
+    Convert sglang arguments to vllm arguments.
+    """
+    original_apply = class_obj.apply
+    sig = inspect.signature(original_apply)
+    param_names = list(sig.parameters.keys())
+    has_correction_bias = "e_score_correction_bias" in param_names
+
+    def new_apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
+    ):
+        assert activation == "silu"
+        assert inplace and not no_combine
+
+        kwargs = {
+            "self": self,
+            "layer": layer,
+            "x": x,
+            "router_logits": router_logits,
+            "top_k": top_k,
+            "renormalize": renormalize,
+            "use_grouped_topk": use_grouped_topk,
+            "topk_group": topk_group,
+            "num_expert_group": num_expert_group,
+            "custom_routing_function": custom_routing_function,
+        }
+        if correction_bias is not None:
+            if not has_correction_bias:
+                raise ValueError(
+                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                )
+            kwargs["e_score_correction_bias"] = correction_bias
+        return original_apply(**kwargs)
+
+    setattr(class_obj, "apply", new_apply)
+
+
+def monkey_patch_quant_configs():
     """Apply all monkey patches in one place."""
     from vllm.model_executor.layers.quantization.awq_marlin import AWQMoEMethod
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+        CompressedTensorsW8A8Fp8MoEMethod,
+        CompressedTensorsWNA16MoEMethod,
+    )
+    from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinMoEMethod

     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)
-
-
+
+    monkey_patch_moe_apply(AWQMoEMethod)
+    monkey_patch_moe_apply(GPTQMarlinMoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)


-
-# Apply patches when module is imported
-apply_monkey_patches()
+monkey_patch_quant_configs()


 __all__ = [
-    "QuantizationConfig",
     "get_quantization_config",
     "QUANTIZATION_METHODS",
 ]
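monkey_patch_isinstance_for_vllm_base_layer works by swapping builtins.isinstance: vllm's get_quant_method checks layers against vllm's own LinearBase, FusedMoE, and VocabParallelEmbedding classes, and the patch redirects those checks to sglang's equivalents until it is undone with reverse=True. A self-contained sketch of the same technique, using stand-in classes rather than the real vllm and sglang layers:

import builtins

class VllmLinearBase: ...      # stand-in for vllm's LinearBase
class SglangLinearBase: ...    # stand-in for sglang's LinearBase

_original_isinstance = builtins.isinstance

def patch(reverse: bool = False):
    if reverse:
        builtins.isinstance = _original_isinstance
        return

    def patched_isinstance(obj, classinfo):
        # Redirect checks against the vllm class to the sglang class.
        if classinfo is VllmLinearBase:
            return _original_isinstance(obj, SglangLinearBase)
        return _original_isinstance(obj, classinfo)

    builtins.isinstance = patched_isinstance

layer = SglangLinearBase()
patch()
assert isinstance(layer, VllmLinearBase)        # vllm-style check now accepts the sglang layer
patch(reverse=True)
assert not isinstance(layer, VllmLinearBase)    # original behavior restored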
sglang/srt/layers/quantization/blockwise_int8.py
CHANGED
@@ -13,12 +13,11 @@ from sglang.srt.layers.linear import (
     LinearMethodBase,
     UnquantizedLinearMethod,
 )
-from sglang.srt.layers.parameter import
+from sglang.srt.layers.parameter import BlockQuantScaleParameter, ModelWeightParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_utils import BlockQuantScaleParameter
 from sglang.srt.layers.quantization.int8_utils import apply_w8a8_block_int8_linear
 from sglang.srt.utils import set_weight_attrs

@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
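The new 146-line JSON files listed above are tuned launch-parameter tables for fused_moe_kernel: each key is a token count M and each value is a Triton config (block sizes, group size, warps, stages) benchmarked for that batch size on the named GPU. Below is a sketch of how such a table can be consulted by picking the nearest key; the lookup helper is illustrative rather than sglang's actual config loader.

# Keys are token counts M; values are fused_moe_kernel launch parameters
# (a few entries copied from the table above).
table = {
    "64":   {"BLOCK_SIZE_M": 16,  "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
             "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
    "256":  {"BLOCK_SIZE_M": 32,  "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
             "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
    "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
             "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
}

def pick_moe_config(table: dict, m: int) -> dict:
    # Nearest-key selection over the benchmarked batch sizes (a common lookup
    # rule for these tables, not the exact code path in fused_moe.py).
    return table[min(table, key=lambda k: abs(int(k) - m))]

print(pick_moe_config(table, m=300))  # -> the "256" entry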