sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/lang/interpreter.py +1 -1
  2. sglang/srt/configs/internvl.py +6 -0
  3. sglang/srt/configs/model_config.py +2 -1
  4. sglang/srt/disaggregation/mini_lb.py +2 -2
  5. sglang/srt/distributed/parallel_state.py +46 -41
  6. sglang/srt/entrypoints/engine.py +1 -1
  7. sglang/srt/entrypoints/http_server.py +5 -1
  8. sglang/srt/entrypoints/openai/protocol.py +3 -3
  9. sglang/srt/entrypoints/openai/serving_chat.py +3 -3
  10. sglang/srt/entrypoints/openai/serving_completions.py +3 -1
  11. sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
  12. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  13. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  14. sglang/srt/layers/attention/aiter_backend.py +93 -68
  15. sglang/srt/layers/communicator.py +45 -7
  16. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  17. sglang/srt/layers/moe/ep_moe/layer.py +2 -7
  18. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  19. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  24. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  25. sglang/srt/layers/moe/utils.py +0 -1
  26. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
  27. sglang/srt/layers/quantization/modelopt_quant.py +35 -2
  28. sglang/srt/layers/quantization/mxfp4.py +4 -1
  29. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  30. sglang/srt/layers/quantization/quark/utils.py +97 -0
  31. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  32. sglang/srt/layers/quantization/w4afp8.py +30 -25
  33. sglang/srt/layers/rocm_linear_utils.py +44 -0
  34. sglang/srt/layers/rotary_embedding.py +0 -18
  35. sglang/srt/managers/cache_controller.py +42 -39
  36. sglang/srt/managers/detokenizer_manager.py +0 -34
  37. sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
  38. sglang/srt/managers/schedule_policy.py +3 -2
  39. sglang/srt/managers/scheduler.py +7 -100
  40. sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
  41. sglang/srt/managers/template_manager.py +3 -3
  42. sglang/srt/managers/tokenizer_manager.py +1 -0
  43. sglang/srt/mem_cache/allocator.py +1 -1
  44. sglang/srt/mem_cache/hicache_storage.py +15 -10
  45. sglang/srt/mem_cache/hiradix_cache.py +16 -0
  46. sglang/srt/mem_cache/memory_pool_host.py +18 -11
  47. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  48. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
  49. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
  50. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  51. sglang/srt/metrics/collector.py +12 -4
  52. sglang/srt/metrics/utils.py +48 -0
  53. sglang/srt/model_executor/forward_batch_info.py +16 -17
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +245 -36
  56. sglang/srt/models/glm4_moe.py +10 -1
  57. sglang/srt/models/gpt_oss.py +5 -4
  58. sglang/srt/models/internvl.py +28 -0
  59. sglang/srt/models/longcat_flash.py +26 -15
  60. sglang/srt/models/longcat_flash_nextn.py +23 -15
  61. sglang/srt/models/minicpmv.py +165 -3
  62. sglang/srt/models/qwen2_moe.py +4 -1
  63. sglang/srt/models/qwen3.py +8 -2
  64. sglang/srt/models/qwen3_moe.py +39 -8
  65. sglang/srt/models/torch_native_llama.py +1 -1
  66. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  67. sglang/srt/server_args.py +79 -2
  68. sglang/srt/speculative/eagle_worker.py +158 -112
  69. sglang/srt/utils.py +12 -10
  70. sglang/test/few_shot_gsm8k.py +1 -0
  71. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  72. sglang/utils.py +1 -0
  73. sglang/version.py +1 -1
  74. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
  75. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
  76. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  77. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  78. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  79. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  80. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  81. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  82. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
  83. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
@@ -175,6 +175,8 @@ class FusedMoE(torch.nn.Module):
         self.moe_tp_rank = get_moe_tensor_parallel_rank()
         assert num_experts % self.moe_ep_size == 0
         self.num_local_experts = num_experts // self.moe_ep_size
+        self.start_expert_id = self.moe_ep_rank * self.num_local_experts
+        self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
         if self.moe_ep_size > 1:
             # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
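
The two added fields give each expert-parallel (EP) rank a contiguous slice of the global expert range, which the W4AFP8 MoE path further down relies on (layer.start_expert_id). A minimal sketch of the arithmetic; the expert count and EP size are illustrative, not taken from the diff:

    # Illustrative only: how the new fields partition experts across EP ranks.
    num_experts = 256   # hypothetical global expert count
    moe_ep_size = 8     # hypothetical expert-parallel world size
    num_local_experts = num_experts // moe_ep_size  # 32 experts per rank

    for moe_ep_rank in range(moe_ep_size):
        start_expert_id = moe_ep_rank * num_local_experts        # 0, 32, 64, ...
        end_expert_id = start_expert_id + num_local_experts - 1  # 31, 63, 95, ...
        print(moe_ep_rank, start_expert_id, end_expert_id)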
@@ -593,8 +595,9 @@ class FusedMoE(torch.nn.Module):

        if (
            "compressed" in self.quant_method.__class__.__name__.lower()
-            and param.data[expert_id] != 1
-            and (param.data[expert_id] - loaded_weight).abs() > 1e-5
+            or "w4afp8" in self.quant_config.get_name()
+            and (param.data[expert_id] != 1).any()
+            and ((param.data[expert_id] - loaded_weight).abs() > 1e-5).any()
        ):
            raise ValueError(
                "input_scales of w1 and w3 of a layer "
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+import triton
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda or _is_hip:
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+
+def moe_align_block_size(
+    topk_ids: torch.Tensor, block_size: int, num_experts: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Aligns the token distribution across experts to be compatible with block
+    size for matrix multiplication.
+
+    Parameters:
+    - topk_ids: A tensor of shape [total_tokens, top_k] representing the
+      top-k expert indices for each token.
+    - block_size: The block size used in block matrix multiplication.
+    - num_experts: The total number of experts.
+
+    Returns:
+    - sorted_token_ids: A tensor containing the sorted token indices according
+      to their allocated expert.
+    - expert_ids: A tensor indicating the assigned expert index for each block.
+    - num_tokens_post_padded: The total number of tokens after padding,
+      ensuring divisibility by block_size.
+
+    This function pads the number of tokens that each expert needs to process
+    so that it is divisible by block_size.
+    Padding ensures that during block matrix multiplication, the dimensions
+    align correctly.
+
+    Example:
+    Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
+    block_size = 4, and num_experts = 4:
+    - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
+      with each expert needing to process 3 tokens.
+    - As block_size is 4, we pad 1 token for each expert.
+    - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
+    - Then append padding tokens [12, 12, 12, 12] for each block.
+    - After sorting by expert index, we obtain token_ids
+      [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
+      Tokens 12 are non-existent (padding) and are ignored in
+      the subsequent matrix multiplication.
+    - The padding ensures that the total number of tokens is now divisible
+      by block_size for proper block matrix operations.
+    """
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
+    cumsum_buffer = torch.empty(
+        (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
+    )
+
+    # Threshold based on benchmark results
+    fuse_sorted_ids_padding = sorted_ids.shape[0] <= 4096
+    if not fuse_sorted_ids_padding:
+        sorted_ids.fill_(topk_ids.numel())
+
+    sgl_moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        fuse_sorted_ids_padding,
+    )
+    return sorted_ids, expert_ids, num_tokens_post_pad
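
A usage sketch for the relocated helper, following the docstring's own example. It assumes a CUDA/ROCm device and the sgl_kernel extension are available; the int32 dtype for topk_ids is an assumption on my part:

    import torch
    from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
        moe_align_block_size,
    )

    topk_ids = torch.tensor(
        [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], dtype=torch.int32, device="cuda"
    )
    sorted_ids, expert_ids, num_tokens_post_pad = moe_align_block_size(
        topk_ids, block_size=4, num_experts=4
    )
    # sorted_ids groups the 12 (token, expert) pairs by expert and pads each
    # expert's slice to a multiple of block_size; padded slots hold
    # topk_ids.numel() == 12 and are skipped by the subsequent MoE GEMM.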
@@ -162,7 +162,6 @@ def get_deepep_config() -> str:
 def is_tbo_enabled() -> bool:
     global IS_TBO_ENABLED
     if IS_TBO_ENABLED is None:
-        logger.warning("IS_TBO_ENABLED is not initialized, using False")
         IS_TBO_ENABLED = False
     return IS_TBO_ENABLED

@@ -132,9 +132,17 @@ def _compile_deep_gemm_one_type_all(
         kernel_type, max_m=max(m_list), n=n, k=k, num_groups=num_groups
     )

+    old_compile_mode = deep_gemm.get_compile_mode()
+    deep_gemm.set_compile_mode(1)
     # TODO can use multi thread
     for m in tqdm(m_list, desc=f"DeepGEMM warmup"):
         executor.execute(m=m)
+    deep_gemm.set_compile_mode(old_compile_mode)
+
+    # clean up input buffers
+    torch.cuda.current_stream().synchronize()
+    del executor
+    torch.cuda.empty_cache()


 class _BaseWarmupExecutor:
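
The warmup now pins deep_gemm's compile mode and restores the previous value afterwards. A hedged sketch of the same save/restore pattern wrapped in try/finally, using only the get_compile_mode/set_compile_mode calls that appear in the diff; the helper name is hypothetical:

    def _warmup_with_compile_mode(deep_gemm, executor, m_list):
        # Save the mode, force compile mode 1 for the warmup loop, always restore.
        old_compile_mode = deep_gemm.get_compile_mode()
        deep_gemm.set_compile_mode(1)
        try:
            for m in m_list:
                executor.execute(m=m)
        finally:
            deep_gemm.set_compile_mode(old_compile_mode)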
@@ -517,6 +517,39 @@ class ModelOptFp4Config(QuantizationConfig):
     def get_config_filenames(cls) -> List[str]:
         return ["hf_quant_config.json"]

+    @staticmethod
+    def common_group_size(cfg: dict) -> int:
+        """Return the unique group_size across the config; raise if missing/mismatched."""
+        sizes = set()
+
+        # Top-level and 'quantization' block
+        v = cfg.get("group_size")
+        if isinstance(v, int):
+            sizes.add(v)
+        q = cfg.get("quantization")
+        if isinstance(q, dict):
+            v = q.get("group_size")
+            if isinstance(v, int):
+                sizes.add(v)
+
+        # config_groups: accept group-level or nested dicts (e.g., weights/input_activations)
+        for g in (cfg.get("config_groups") or {}).values():
+            if isinstance(g, dict):
+                v = g.get("group_size")
+                if isinstance(v, int):
+                    sizes.add(v)
+                for sub in g.values():
+                    if isinstance(sub, dict):
+                        v = sub.get("group_size")
+                        if isinstance(v, int):
+                            sizes.add(v)
+
+        if not sizes:
+            raise ValueError("No group_size found in config.")
+        if len(sizes) > 1:
+            raise ValueError(f"Inconsistent group_size values: {sorted(sizes)}")
+        return next(iter(sizes))
+
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> ModelOptFp4Config:
         # Handle two different config formats:
@@ -549,7 +582,7 @@ class ModelOptFp4Config(QuantizationConfig):
            else:
                kv_cache_quant_algo = "auto"

-            group_size = config.get("group_size")
+            group_size = ModelOptFp4Config.common_group_size(config)
            exclude_modules = config.get("ignore", [])
        else:
            # Fall back to nested format (hf_quant_config.json - legacy format)
@@ -559,7 +592,7 @@ class ModelOptFp4Config(QuantizationConfig):
                kv_cache_quant_algo = quant_config.get("kv_cache_quant_algo")
                if not kv_cache_quant_algo:
                    kv_cache_quant_algo = "auto"
-                group_size = quant_config.get("group_size")
+                group_size = ModelOptFp4Config.common_group_size(config)
                exclude_modules = quant_config.get("exclude_modules", [])
            except (ValueError, KeyError):
                raise ValueError(
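
common_group_size is now the single source of truth for group_size in both config formats: it collects every group_size it can find and insists they agree. A small illustration with made-up config fragments:

    from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp4Config

    flat_cfg = {
        "group_size": 16,
        "config_groups": {
            "group_0": {
                "weights": {"group_size": 16},
                "input_activations": {"group_size": 16},
            }
        },
    }
    assert ModelOptFp4Config.common_group_size(flat_cfg) == 16

    mismatched = {"group_size": 16, "quantization": {"group_size": 32}}
    # ModelOptFp4Config.common_group_size(mismatched) raises
    # ValueError("Inconsistent group_size values: [16, 32]")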
@@ -816,7 +816,10 @@ class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase):
         moe_runner_config: MoeRunnerConfig,
     ) -> torch.Tensor:
         topk_weights, topk_ids, _ = topk_output
-
+        if _is_hip:
+            topk_weights = topk_weights.to(
+                torch.float32
+            )  # aiter's moe_sorting requires topk_weights to be FP32
         return fused_moe(
             x,
             layer.w13_weight,
@@ -8,6 +8,7 @@ import torch.nn.functional as F
     from aiter.ops.gemm_op_a4w4 import gemm_a4w4
     from aiter.ops.shuffle import shuffle_weight
     from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+    from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant
     from aiter.ops.triton.quant import dynamic_mxfp4_quant
     from aiter.utility import dtypes
     from aiter.utility.fp4_utils import e8m0_shuffle
@@ -38,15 +39,6 @@ class QuarkW4A4MXFP4(QuarkScheme):
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         return

-        # for aiter implement
-        # wshuffle = shuffle_weight(layer.weight.data, layout=(16, 16))
-        # w_scales_shuffle = e8m0_shuffle(layer.weight_scale.data).view(dtypes.fp8_e8m0)
-
-        # layer.weight = torch.nn.Parameter(wshuffle,
-        #                                   requires_grad=False)
-        # layer.weight_scale = torch.nn.Parameter(w_scales_shuffle,
-        #                                         requires_grad=False)
-
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -93,26 +85,53 @@ class QuarkW4A4MXFP4(QuarkScheme):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-
-        out_dtype = x.dtype
-        # M = x.shape[0]
-        # N = layer.weight.shape[0]
-
-        # quant_func = aiter.get_triton_quant(aiter.QuantType.per_1x32)
-        # x, x_scales_shuffle = quant_func(x, shuffle=True)
-
-        # y = torch.zeros((M + 255) // 256 * 256, N, device=x.device, dtype=self.out_dtype)
-
-        # out = gemm_a4w4(x, layer.weight.data, x_scales_shuffle, layer.weight_scale.data, y, bias=bias)
-
-        # return out[:M]
-
-        # triton implement
-        x_q, x_s = dynamic_mxfp4_quant(x)
-        y = torch.empty(
-            x_q.shape[0], layer.weight.shape[0], device=x_q.device, dtype=out_dtype
+        # This path does not have support for bias currently
+        assert bias is None, "bias is not supported"
+
+        three_d = False
+        x_s = None
+        y = None
+        if isinstance(x, tuple):
+            assert len(x) in [
+                2,
+                3,
+            ], "For tuple input, only (x, x_s) or (x, x_s, y) formats are accepted"
+            if len(x) == 2:
+                x, x_s = x
+            elif len(x) == 3:
+                x, x_s, y = x
+
+        use_fused_quant_gemm = (
+            x_s is None and y is not None and layer.weight.shape[0] == y.shape[1]
         )

-        out = gemm_afp4wfp4(x_q, layer.weight, x_s, layer.weight_scale, out_dtype, y)
-
-        return out
+        if x.dim() == 3:
+            three_d = True
+            x = x.view(-1, x.shape[-1])
+        output_shape = [*x.shape[:-1], layer.weight.shape[0]]
+
+        # use_fused_quant_gemm = true, x_q is a bf16/fp16 num
+        # x_s is not None = true, x_q is uint8 num
+        if use_fused_quant_gemm or x_s is not None:
+            x_q = x
+        else:
+            x_q, x_s = dynamic_mxfp4_quant(x)
+
+        if y is None:
+            y = torch.empty(
+                x_q.shape[0],
+                layer.weight.shape[0],
+                device=x_q.device,
+                dtype=self.out_dtype,
+            )
+
+        if use_fused_quant_gemm:
+            gemm_afp4wfp4_pre_quant(x_q, layer.weight, layer.weight_scale, y.dtype, y)
+            y = y.to(x.dtype)
+        else:
+            gemm_afp4wfp4(x_q, layer.weight, x_s, layer.weight_scale, self.out_dtype, y)
+
+        if three_d:
+            return y.view(*output_shape)
+
+        return y
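
The rewritten apply path changes the activation contract: the input may be a plain tensor or a tuple that already carries MXFP4 scales and, optionally, a preallocated output buffer. A small illustrative helper (not part of sglang) that mirrors the tuple handling above:

    def _unpack_mxfp4_input(x):
        # Mirrors the accepted forms in the diff: x, (x, x_s), or (x, x_s, y).
        x_s = None
        y = None
        if isinstance(x, tuple):
            if len(x) == 2:
                x, x_s = x
            elif len(x) == 3:
                x, x_s, y = x
            else:
                raise AssertionError("expected (x, x_s) or (x, x_s, y)")
        return x, x_s, y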
@@ -5,6 +5,10 @@ from collections.abc import Iterable, Mapping
 from types import MappingProxyType
 from typing import Any, Optional

+import torch
+from aiter.ops.triton.quant import dynamic_mxfp4_quant
+from torch import nn
+

 def deep_compare(dict1: Any, dict2: Any) -> bool:
     if type(dict1) is not type(dict2):
@@ -105,3 +109,96 @@ def _is_equal_or_regex_match(
     elif target == value:
         return True
     return False
+
+
+# utility for tensor dims > 2 cases
+def b_dynamic_mxfp4_quant(x):
+    h, b, d = x.shape
+    x, x_scales = dynamic_mxfp4_quant(x.reshape(-1, d))
+    return x.view(h, b, d // 2), x_scales.view(h, b, d // 32)
+
+
+def mxfp4_to_f32(x, is_threed):
+    # 2 because we pack fp4 in uint8.
+    x = x.repeat_interleave(2, dim=-1)
+    if is_threed:
+        x[..., ::2] = x[..., ::2] & 0xF
+        x[..., 1::2] = x[..., 1::2] >> 4
+    else:
+        x[:, ::2] = x[:, ::2] & 0xF
+        x[:, 1::2] = x[:, 1::2] >> 4
+
+    mxfp4_list = [
+        0.0,
+        0.5,
+        1.0,
+        1.5,
+        2.0,
+        3.0,
+        4.0,
+        6.0,
+        -0.0,
+        -0.5,
+        -1.0,
+        -1.5,
+        -2.0,
+        -3.0,
+        -4.0,
+        -6.0,
+    ]
+    mxfp4_in_f32 = torch.tensor(mxfp4_list, dtype=torch.float32, device="cuda")
+    return mxfp4_in_f32[x.long()]
+
+
+def e8m0_to_f32(x):
+    # Convert the input tensor `x` (assumed to be in e8m0 format) to float32.
+    # e8m0 is a custom 8-bit floating point format with 8 bits for exponent, 0 for mantissa.
+    # This means the value is essentially 2^(exponent - 127), similar to how IEEE-754 stores floats.
+
+    # Convert x to float32 for computation, and compute the power of 2 by subtracting the bias (127).
+    x_f32 = 2 ** ((x.to(torch.float32)) - 127)
+
+    # If the exponent value was 255 (i.e., 2^(128)), this is a special case usually used to represent NaN or Inf.
+    # Since this custom format has no mantissa, treat 2^128 as NaN.
+    x_f32[x_f32 == 128] = float("nan")
+    return x_f32
+
+
+def quark_post_load_weights(self_attn: nn.Module, w: torch.Tensor, quant_format: str):
+    if "mxfp4" in quant_format:
+        # when dtype is bf16, the processing flow is to dynamic quantize bf16 tensor to uint8 tensor
+        # do w_kc (bf16) first to get the w_kc(uint8) w_s_kc(uint8)
+        # and w_vc repeating the same procedure of w_kc to get w_vc(uint8) w_s_vc(uint8)
+        if w.dtype == torch.bfloat16:
+            w_kc, w_vc = w.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+        elif w.dtype == torch.uint8:  # static quant for mxfp4
+            # when dtype is uint8, it means the w has been quantized to mxfp4 format
+            # but we must separate it to w_kc and w_vc.
+            # The quantized tensor size is only half of original tensor size
+            # and the scaling factor is 1/32, the transpose behavior will be not correct
+            # need to upcast it to fp32 to separate w to w_kc and w_vc
+            # to ensure the following transpose behavior is correct
+            # and then do mxfp4 quant again
+            w = mxfp4_to_f32(w, True).to(torch.bfloat16)
+            w_scales = self_attn.kv_b_proj.weight_scale.repeat_interleave(32, dim=-1)
+            w_scales = e8m0_to_f32(w_scales).to(torch.bfloat16)
+            w = w * w_scales
+            w_kc, w_vc = w.unflatten(
+                0, (-1, (self_attn.qk_nope_head_dim + self_attn.v_head_dim))
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+
+    return w_kc, w_s_kc, w_vc, w_s_vc
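
e8m0_to_f32 decodes a biased power-of-two exponent, 2 ** (byte - 127). A short worked example with illustrative byte values:

    import torch

    codes = torch.tensor([126, 127, 128, 130], dtype=torch.uint8)
    print(2.0 ** (codes.to(torch.float32) - 127))
    # tensor([0.5000, 1.0000, 2.0000, 8.0000])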
@@ -0,0 +1,13 @@
+from aiter.ops.triton.batched_gemm_afp4wfp4_pre_quant import (
+    batched_gemm_afp4wfp4_pre_quant,
+)
+from aiter.ops.triton.fused_mxfp4_quant import (
+    fused_flatten_mxfp4_quant,
+    fused_rms_mxfp4_quant,
+)
+
+__all__ = [
+    "fused_rms_mxfp4_quant",
+    "fused_flatten_mxfp4_quant",
+    "batched_gemm_afp4wfp4_pre_quant",
+]
@@ -1,12 +1,14 @@
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional

 import torch
 from torch.nn import Module
 from torch.nn.parameter import Parameter

+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import (
     FusedMoEMethodBase,
     QuantizationConfig,
@@ -91,12 +93,13 @@ class W4AFp8Config(QuantizationConfig):
         from sglang.srt.layers.linear import LinearBase
         from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.managers.schedule_batch import global_server_args_dict

         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix, self.ignored_layers):
                 return UnquantizedLinearMethod()
             return Fp8LinearMethod(self)
-        elif isinstance(layer, EPMoE):
+        elif isinstance(layer, FusedMoE):
             return W4AFp8MoEMethod(self)
         return None

@@ -104,8 +107,24 @@ class W4AFp8Config(QuantizationConfig):
         return []


-class W4AFp8MoEMethod(FusedMoEMethodBase):
+def interleave_scales(scales: torch.Tensor) -> torch.Tensor:
+    """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
+    s_shape = scales.shape
+    # Reshape to separate groups of 4
+    alignment = 4 if s_shape[2] % 4 == 0 else 1
+    scales_interleaved = scales.reshape(
+        s_shape[0], s_shape[1], (s_shape[2] // alignment), alignment
+    )
+    # Permute dimensions to interleave
+    scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
+    # Reshape back to original dimensions but with interleaved values
+    scales_interleaved = scales_interleaved.reshape(
+        s_shape[0], s_shape[2] // alignment, s_shape[1] * alignment
+    )
+    return scales_interleaved.contiguous()
+
+
+class W4AFp8MoEMethod(FusedMoEMethodBase):
     def __init__(self, quant_config: W4AFp8Config):
         self.quant_config = quant_config
@@ -234,33 +253,18 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

         return

-    def _interleave_scales(self, scales: torch.Tensor) -> torch.Tensor:
-        """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
-        s_shape = scales.shape
-        # Reshape to separate groups of 4
-        scales_interleaved = scales.reshape(
-            s_shape[0], s_shape[1], (s_shape[2] // 4), 4
-        )
-        # Permute dimensions to interleave
-        scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
-        # Reshape back to original dimensions but with interleaved values
-        scales_interleaved = scales_interleaved.reshape(
-            s_shape[0], s_shape[2] // 4, s_shape[1] * 4
-        )
-        return scales_interleaved.contiguous()
-
     def process_weights_after_loading(self, layer: Module) -> None:
         dtype = torch.bfloat16
         device = layer.w2_weight.device

         # Interleave w13_weight_scale (gate_up_proj)
         w13_weight_scale = layer.w13_weight_scale_inv.to(dtype)
-        w13_weight_scale = self._interleave_scales(w13_weight_scale)
+        w13_weight_scale = interleave_scales(w13_weight_scale)
         layer.w13_weight_scale_inv = Parameter(w13_weight_scale, requires_grad=False)

         # Interleave w2_weight_scale (down_proj)
         w2_weight_scale = layer.w2_weight_scale_inv.to(dtype)
-        w2_weight_scale = self._interleave_scales(w2_weight_scale)
+        w2_weight_scale = interleave_scales(w2_weight_scale)
         layer.w2_weight_scale_inv = Parameter(w2_weight_scale, requires_grad=False)

         # Process input scales
@@ -291,11 +295,12 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

         topk_weights, topk_ids, _ = topk_output
         local_topk_ids = topk_ids
-        local_topk_ids = torch.where(
-            topk_ids == -1,
-            layer.num_experts,
-            topk_ids,
-        )
+        if get_moe_expert_parallel_world_size() > 1:
+            local_topk_ids = torch.where(
+                topk_ids == -1,
+                layer.num_experts,
+                topk_ids,
+            )

         output = cutlass_w4a8_moe(
             layer.start_expert_id,
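
With expert parallelism, top-k ids that route to experts hosted on other ranks arrive as -1; the new guard remaps them to layer.num_experts (a sentinel slot) only when EP is actually in use. An illustration with made-up routing ids:

    import torch

    num_experts = 8  # illustrative per-layer expert count
    topk_ids = torch.tensor([[0, 3, -1], [5, -1, 7]])
    local_topk_ids = torch.where(topk_ids == -1, num_experts, topk_ids)
    print(local_topk_ids)  # tensor([[0, 3, 8], [5, 8, 7]])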
@@ -0,0 +1,44 @@
+import torch
+from aiter.ops.triton.fused_qk_concat import fused_qk_rope_cat
+from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+from aiter.ops.triton.gemm_a16w16_atomic import gemm_a16w16_atomic
+
+from sglang.srt.utils import BumpAllocator
+
+__all__ = ["fused_qk_rope_cat"]
+
+
+def aiter_dsv3_router_gemm(
+    hidden_states: torch.Tensor,
+    weight: torch.Tensor,
+    gemm_output_zero_allocator: BumpAllocator = None,
+):
+    M = hidden_states.shape[0]
+    N = weight.shape[0]
+    y = None
+
+    if M <= 256:
+        # TODO (cagri): convert to bfloat16 as part of another kernel to save time
+        # for now it is also coupled with zero allocator.
+        if gemm_output_zero_allocator != None:
+            y = gemm_output_zero_allocator.allocate(M * N).view(M, N)
+        else:
+            y = torch.zeros((M, N), dtype=torch.float32, device=hidden_states.device)
+
+    if y is not None:
+        logits = gemm_a16w16_atomic(hidden_states, weight, y=y).to(hidden_states.dtype)
+    else:
+        logits = gemm_a16w16(hidden_states, weight)
+
+    return logits
+
+
+def get_dsv3_gemm_output_zero_allocator_size(
+    n_routed_experts: int, num_moe_layers: int, allocate_size: int, embedding_dim: int
+):
+    if embedding_dim != 7168 or n_routed_experts != 256:
+        return 0
+
+    per_layer_size = 256 * (allocate_size + n_routed_experts)
+
+    return num_moe_layers * per_layer_size
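
get_dsv3_gemm_output_zero_allocator_size only budgets for the DeepSeek-V3-shaped case (embedding_dim 7168, 256 routed experts) and otherwise returns 0. The arithmetic for an illustrative call; the layer count and allocate_size below are made up:

    n_routed_experts = 256
    allocate_size = 128
    num_moe_layers = 58

    per_layer_size = 256 * (allocate_size + n_routed_experts)  # 256 * 384 = 98304
    total = num_moe_layers * per_layer_size                    # 58 * 98304 = 5701632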
@@ -1433,24 +1433,6 @@ class MRotaryEmbedding(RotaryEmbedding):

         return position_ids, mrope_position_deltas

-    @staticmethod
-    def get_next_input_positions(
-        mrope_position_delta: int,
-        context_len: int,
-        seq_len: int,
-    ) -> torch.Tensor:
-        return torch.tensor(
-            [
-                list(
-                    range(
-                        context_len + mrope_position_delta,
-                        seq_len + mrope_position_delta,
-                    )
-                )
-                for _ in range(3)
-            ]
-        )
-

 class DualChunkRotaryEmbedding(CustomOp):
     """Rotary positional embedding for Dual Chunk Attention."""