sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the differences between publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED

@@ -28,8 +28,9 @@ else:
 import logging

 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

-if
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight

@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,

@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,

@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -179,12 +183,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )

-        if
+        if _use_aiter:
             assert not no_combine, "unsupported"
             if apply_router_weight_on_input:
                 assert (

@@ -220,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     def forward_cpu(

@@ -232,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,

@@ -245,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )

@@ -289,6 +297,7 @@ class FusedMoE(torch.nn.Module):
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,

@@ -307,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )

@@ -321,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias

@@ -546,7 +557,8 @@ class FusedMoE(torch.nn.Module):
             loaded_weight = loaded_weight.to(param.data.device)

             if (
-
+                "compressed" in self.quant_method.__class__.__name__.lower()
+                and param.data[expert_id] != 1
                 and (param.data[expert_id] - loaded_weight).abs() > 1e-5
             ):
                 raise ValueError(

@@ -570,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return

         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:

@@ -651,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
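For orientation, the `_use_aiter` gate introduced above follows a common pattern: an environment-variable flag combined with a platform check decides whether an optional backend is imported at module load time. Below is a minimal, self-contained sketch of that pattern; the helper bodies are stand-ins written for this example, not sglang's actual implementations.

```python
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Stand-in for the helper sglang exposes in sglang.srt.utils.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


def is_hip() -> bool:
    # Rough ROCm detection for the sketch; assumes torch is installed.
    try:
        import torch
        return torch.version.hip is not None
    except ImportError:
        return False


_is_hip = is_hip()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

if _use_aiter:
    # Only reached on ROCm with SGLANG_USE_AITER enabled, mirroring the hunk above.
    from aiter.ops.shuffle import shuffle_weight  # noqa: F401
```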
sglang/srt/layers/moe/topk.py
CHANGED
@@ -18,6 +18,7 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F

+from sglang.srt.managers import expert_location_dispatch
 from sglang.srt.managers.expert_distribution import (
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,

@@ -65,6 +66,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

@@ -87,9 +89,27 @@
     )
     del token_expert_indicies

+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids

@@ -102,7 +122,7 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -127,10 +147,10 @@
     ) # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,

@@ -140,7 +160,7 @@
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum

@@ -159,7 +179,7 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -191,10 +211,10 @@
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)

-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,

@@ -204,7 +224,7 @@
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum

@@ -238,7 +258,7 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -246,7 +266,7 @@
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group

@@ -259,7 +279,7 @@
             num_expert_group,
             topk_group,
             topk,
-
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
         # TODO merge into kernel for this branch

@@ -287,7 +307,7 @@
             renormalize,
             num_expert_group,
             topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -302,6 +322,7 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,

@@ -309,7 +330,14 @@
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None

@@ -322,7 +350,7 @@
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -336,7 +364,7 @@
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -353,15 +381,13 @@
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
-        assert (
-            num_token_non_padded is None
-        ), "num_token_non_padded is not yet supported in fused_topk"
         # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
     else:
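The `num_fused_shared_experts` plumbing culminates in the routing change visible in `grouped_topk` and `biased_grouped_topk_impl` above: when shared experts are fused into the expert list, the last top-k slot is redirected to a random shared-expert id, and the renormalization sum skips that slot. Below is a standalone PyTorch sketch of just that step, using toy sizes and plain tensors; it mirrors the diff's logic but is not the sglang kernel path.

```python
import torch


def topk_with_fused_shared_experts(
    scores: torch.Tensor,  # [num_tokens, num_experts] routing scores
    topk: int,
    num_fused_shared_experts: int,
    renormalize: bool = True,
):
    num_experts = scores.shape[1]
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)

    if num_fused_shared_experts:
        # Point the last slot at one of the shared experts appended after the
        # routed experts, as the diff does with torch.randint.
        topk_ids[:, -1] = torch.randint(
            low=num_experts,
            high=num_experts + num_fused_shared_experts,
            size=(topk_ids.size(0),),
            dtype=topk_ids.dtype,
            device=topk_ids.device,
        )

    if renormalize:
        # Exclude the shared-expert slot from the normalizer when fusion is on.
        topk_weights_sum = (
            topk_weights.sum(dim=-1, keepdim=True)
            if num_fused_shared_experts == 0
            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
        )
        topk_weights = topk_weights / topk_weights_sum
    return topk_weights, topk_ids


if __name__ == "__main__":
    scores = torch.rand(4, 8).softmax(dim=-1)
    w, ids = topk_with_fused_shared_experts(scores, topk=3, num_fused_shared_experts=1)
    print(ids)  # last column holds id 8 when one shared expert is fused
```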
sglang/srt/layers/multimodal.py
CHANGED
@@ -32,8 +32,8 @@ def hash_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements

-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
     hash_val = mixed * PRIME
     hash_val = hash_val ^ (hash_val >> 16)
     hash_val = hash_val * (PRIME ^ XCONST)

@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n, BLOCK_SIZE),)

-    intermediate_hashes = torch.empty(n, dtype=torch.
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)

     hash_kernel[grid](
         tensor,
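The `hash_kernel` change widens the loaded data and offsets to `tl.int64` and allocates the intermediate hash buffer as `torch.int64`, presumably so the multiply-and-xor mixing no longer wraps at 32 bits. A small PyTorch illustration of the wraparound being avoided; the constant below is made up for the example and is not the kernel's actual PRIME.

```python
import torch

PRIME = 1_000_003  # illustrative constant only

vals = torch.tensor([123_456, 789_012], dtype=torch.int32)

wrapped = vals * PRIME                   # int32 math: the product overflows and wraps
widened = vals.to(torch.int64) * PRIME   # int64 math: the full product survives

print(wrapped)  # wrapped 32-bit values
print(widened)  # exact products, analogous to what the updated kernel keeps
```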
sglang/srt/layers/quantization/__init__.py
CHANGED

@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )

     return QUANTIZATION_METHODS[quantization]

@@ -289,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -315,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
sglang/srt/layers/quantization/blockwise_int8.py
CHANGED

@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,

@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
         )


@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             correction_bias=correction_bias,
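All of the hunks from `monkey_patch_moe_apply` down to `CompressedTensorsWNA16MoEMethod` apply the same mechanical change: add a defaulted `num_fused_shared_experts: int = 0` parameter to each quantized MoE `apply` signature and forward it to the inner expert-selection call. A compact sketch of that pattern, using hypothetical class and helper names rather than sglang's real ones:

```python
from typing import Callable, Optional

import torch


def select_experts_stub(
    router_logits: torch.Tensor,
    top_k: int,
    num_fused_shared_experts: int = 0,
):
    # Stand-in for the real expert selection: when non-zero, the fused shared
    # experts would be appended to every token's routing choice downstream.
    topk = torch.topk(router_logits, k=top_k, dim=-1)
    return topk.indices, topk.values, num_fused_shared_experts


class QuantMoEMethodSketch:
    def apply(
        self,
        router_logits: torch.Tensor,
        top_k: int,
        num_fused_shared_experts: int = 0,  # new defaulted kwarg; old callers unaffected
        custom_routing_function: Optional[Callable] = None,
    ):
        return select_experts_stub(
            router_logits,
            top_k,
            num_fused_shared_experts=num_fused_shared_experts,  # threaded through
        )
```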
@@ -17,10 +17,10 @@ _ENABLE_JIT_DEEPGEMM = False
 try:
     import deep_gemm
     from deep_gemm import get_num_sms
+    from deep_gemm.jit import build
     from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner

     sm_version = get_device_sm()
     if sm_version == 90:
@@ -148,32 +148,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +183,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 def _compile_gemm_nt_f8f8bf16_one(
@@ -228,28 +219,23 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
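The three `_compile_*` hunks above share one refactor: the per-kernel keys that used to be split between `keys=` and `kwargs=` for `jit_tuner.compile_and_tune` are now folded into a single dict, rendered with `FP8GemmRuntime.generate`, and compiled via `deep_gemm.jit.build`. A condensed sketch of that shared shape (argument order taken from the hunks themselves; `_precompile` is a hypothetical helper name):

```python
# Requires deep_gemm; these two imports are exactly the ones added in the
# `@@ -17,10 +17,10 @@` hunk above.
from deep_gemm.jit import build
from deep_gemm.jit_kernels.runtime import FP8GemmRuntime


def _precompile(name: str, kwargs: dict) -> None:
    # Render the CUDA source for this configuration, then JIT-compile and cache it.
    code = FP8GemmRuntime.generate(kwargs)
    _ = build(name, code, FP8GemmRuntime, kwargs)
```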
@@ -391,3 +377,16 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)