sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
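The JSON above is one of the newly added fused-MoE tuning tables: each top-level key is a token count M, and its value holds the Triton launch parameters tuned for that batch size on the named GPU. As a rough sketch of how such a table can be consumed (the helper names are illustrative, not sglang's actual API, and the nearest-key lookup is an assumption about the selection rule):

import json

def load_moe_kernel_config(path: str) -> dict:
    """Read a tuned fused-MoE config file and key it by integer batch size."""
    with open(path) as f:
        return {int(m): cfg for m, cfg in json.load(f).items()}

def pick_config(configs: dict, m: int) -> dict:
    """Pick the tuned entry whose batch-size key is closest to the actual M."""
    best_m = min(configs, key=lambda k: abs(k - m))
    return configs[best_m]

# Example: choose launch parameters for a batch of 300 tokens.
# configs = load_moe_kernel_config("E=264,N=256,...,block_shape=[128, 128].json")
# params = pick_config(configs, 300)  # -> BLOCK_SIZE_M, GROUP_SIZE_M, num_warps, ...

Keeping a per-M table lets the runtime switch tile sizes as the batch grows (in this file BLOCK_SIZE_M jumps from 16 to 64 around M=1024) without re-tuning at serving time.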
@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )

 _is_hip = is_hip()

@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}

@@ -991,7 +994,7 @@ def get_default_config(
            "num_stages": 2 if _is_hip else 4,
        }
    else:
-        # Block-wise quant: BLOCK_SIZE_K must be
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
        config = {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": block_shape[0],
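The first two hunks above route the "Using MoE kernel config ..." message through a new log_info_on_rank0 helper so it is emitted once rather than once per rank. A minimal sketch of what such a helper typically looks like, assuming torch.distributed supplies the rank (an illustration, not the actual implementation in sglang.srt.utils):

import logging
import torch.distributed as dist

def log_info_on_rank0(logger: logging.Logger, msg: str) -> None:
    # Only rank 0 of the initialized default process group emits the message,
    # so multi-GPU runs do not repeat it once per rank.
    if not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0:
        logger.info(msg)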
sglang/srt/layers/moe/topk.py CHANGED
@@ -270,7 +270,7 @@ def select_experts(
     routed_scaling_factor: Optional[float] = None,
 ):
     n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
-    #
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
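The restored comment notes that DeepSeek V2/V3/R1 models route experts with grouped_top_k. For context, a hedged sketch of group-limited routing in plain PyTorch (the general idea only; sglang's select_experts adds correction biases, renormalization, and fused kernels):

import torch

def grouped_topk_sketch(scores: torch.Tensor, num_expert_group: int,
                        topk_group: int, top_k: int):
    """scores: [num_tokens, num_experts] router scores (e.g. after softmax)."""
    num_tokens, num_experts = scores.shape
    # Score each expert group by its best expert, keep only the top groups.
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
    score_mask = group_mask.unsqueeze(-1).expand(
        num_tokens, num_expert_group, num_experts // num_expert_group
    ).reshape(num_tokens, num_experts)
    # Pick the per-token top-k experts only from the surviving groups.
    masked = scores.masked_fill(score_mask == 0, float("-inf"))
    topk_weights, topk_ids = torch.topk(masked, k=top_k, dim=-1)
    return topk_weights, topk_ids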
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "
+            "Please install vllm by `pip install vllm==0.8.4`"
         )

     return QUANTIZATION_METHODS[quantization]
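This hunk completes the error message with a concrete install hint for quantization methods that still rely on vllm operators. The surrounding guard is a common optional-dependency pattern; a self-contained sketch with placeholder entries (the real registry and method sets live in sglang.srt.layers.quantization):

import importlib.util

VLLM_AVAILABLE = importlib.util.find_spec("vllm") is not None

# Illustrative registry: method name -> config object (placeholder strings here).
QUANTIZATION_METHODS = {"fp8": "Fp8Config", "awq": "AWQConfig"}
VLLM_QUANTIZATION_METHODS = {"awq"}  # methods that still need vllm kernels (example)

def get_quantization_config(quantization: str):
    if quantization not in QUANTIZATION_METHODS:
        raise ValueError(f"Invalid quantization method: {quantization}")
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
            "Please install vllm by `pip install vllm==0.8.4`"
        )
    return QUANTIZATION_METHODS[quantization]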
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
                 f"{input_size_per_partition} is not divisible by "
                 f"weight quantization block_k = {block_k}."
             )
-        # Required by
+        # Required by column parallel or enabling merged weights
         if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
             output_partition_sizes
         ) > 1:

@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
             self.quant_config.weight_block_size[1],
         )
         # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-        # Required by
+        # Required by column parallel or enabling merged weights
         if intermediate_size % block_n != 0:
             raise ValueError(
                 f"The output_size of gate's and up's weight = "
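Both hunks finish the same truncated comment: the checks exist because column-parallel sharding and merged gate/up weights split the output dimension, and every shard must still cover whole quantization blocks. A small illustrative check (values below are examples, not taken from the diff):

def check_blockwise_partition(output_size: int, tp_size: int, block_n: int) -> None:
    """Each tensor-parallel shard of a block-quantized weight must cover whole
    blocks, otherwise one block's scale would span two shards."""
    output_size_per_partition = output_size // tp_size
    if output_size_per_partition % block_n != 0:
        raise ValueError(
            f"output_size_per_partition={output_size_per_partition} is not "
            f"divisible by weight quantization block_n={block_n}."
        )

# Example: a 4096-wide projection sharded over tp_size=4 with 128x128 blocks
# passes, because 1024 % 128 == 0.
check_blockwise_partition(4096, tp_size=4, block_n=128)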
@@ -10,16 +10,14 @@ import torch
 from compressed_tensors import CompressionFormat
 from compressed_tensors.quantization import QuantizationStrategy

-from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
 from sglang.srt.layers.quantization.utils import (
     all_close_1d,
-    is_cuda,
-    is_fp8_fnuz,
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import is_cuda, set_weight_attrs

 _is_cuda = is_cuda()

@@ -15,11 +15,12 @@ from sglang.srt.layers.parameter import (
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     normalize_e4m3fn_to_e4m3fnuz,
 )
-from sglang.srt.layers.quantization.utils import
+from sglang.srt.layers.quantization.utils import requantize_with_max_scale

 __all__ = ["CompressedTensorsW8A8Fp8"]

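These two hunks move is_fp8_fnuz next to the other FP8 kernel helpers. The distinction matters because some ROCm GPUs use the e4m3fnuz FP8 variant, whose numeric range differs from the e4m3fn used on NVIDIA hardware, which is why normalize_e4m3fn_to_e4m3fnuz is imported alongside it. A rough sketch of such a platform probe (illustrative only, not sglang's implementation):

import torch

def is_fp8_fnuz_sketch() -> bool:
    # ROCm (HIP) builds of PyTorch commonly use the "fnuz" FP8 variant, whose
    # finite range differs from e4m3fn, so scales may need adjusting when
    # converting checkpoints between the two formats.
    return torch.version.hip is not None and hasattr(torch, "float8_e4m3fnuz")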
@@ -15,12 +15,9 @@ _ENABLE_JIT_DEEPGEMM = False
 if is_cuda():
     import deep_gemm
     from deep_gemm import get_num_sms
+    from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
-    from deep_gemm.jit_kernels.
-    from deep_gemm.jit_kernels.gemm import template as deep_gemm_gemm_template
-    from deep_gemm.jit_kernels.m_grouped_gemm import (
-        template as deep_gemm_grouped_gemm_template,
-    )
+    from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
     from deep_gemm.jit_kernels.tuner import jit_tuner

 sm_version = get_device_sm()

@@ -28,6 +25,11 @@ if is_cuda():
     if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
         _ENABLE_JIT_DEEPGEMM = True

+
+def get_enable_jit_deepgemm():
+    return _ENABLE_JIT_DEEPGEMM
+
+
 logger = logging.getLogger(__name__)

 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))

@@ -40,10 +42,25 @@ _COMPILE_WORKERS = get_int_env_var("SGL_JIT_DEEPGEMM_COMPILE_WORKERS", 4)
 _IN_PRECOMPILE_STAGE = get_bool_env_var("SGL_IN_DEEPGEMM_PRECOMPILE_STAGE", "false")

 # Force redirect deep_gemm cache_dir
-os.environ["
-    "SGL_DG_CACHE_DIR", os.path.expanduser("~")
+os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
+    "SGL_DG_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "deep_gemm")
 )

+# Refer to https://github.com/deepseek-ai/DeepGEMM/commit/d75b218b7b8f4a5dd5406ac87905039ead3ae42f
+# NVRTC may have performance loss with some cases.
+# And NVCC JIT speed is also 9x faster in the ref commit
+_USE_NVRTC_DEFAULT = "0"
+if _ENABLE_JIT_DEEPGEMM:
+    try:
+        get_nvcc_compiler()
+    except:
+        logger.warning(
+            "NVCC Compiler not found, use NVRTC for DeepGEMM JIT "
+            "and may have performance loss with some cases."
+        )
+        _USE_NVRTC_DEFAULT = "1"
+os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", _USE_NVRTC_DEFAULT)
+

 def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
     global _BUILTIN_M_LIST

@@ -98,10 +115,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
-            "Entering DeepGEMM JIT Pre-
+            "Entering DeepGEMM JIT Pre-Compile session. "
             "And it may takes a long time(Typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
-            "
+            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
             "For example: "
             "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"

@@ -110,7 +127,7 @@ def _compile_warning_1():

 def _compile_warning_2():
     logger.warning(
-        "Entering DeepGEMM JIT Single Kernel
+        "Entering DeepGEMM JIT Single Kernel Compile session. "
         "And it will makes inference throughput becomes flaky. "
         "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
         " for pre-compilation to solve this issue. "

@@ -125,10 +142,18 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     num_groups: int,
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="m_grouped_gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,

@@ -141,24 +166,11 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
             "NUM_STAGES": num_stages,
             "NUM_TMA_MULTICAST": tma_multicast_config[0],
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE":
+            "GEMM_TYPE": GemmType.GroupedMasked,
         },
         space=(),
-
-
-            ("lhs", torch.float8_e4m3fn),
-            ("lhs_scales", torch.float),
-            ("rhs", torch.float8_e4m3fn),
-            ("rhs_scales", torch.float),
-            ("out", torch.bfloat16),
-            ("grouped_layout", torch.int32),
-            ("m", int),
-            ("stream", torch.cuda.Stream),
-            ("num_sms", int),
-            ("smem_size", int),
-        ),
-        template=deep_gemm_grouped_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )


@@ -168,9 +180,18 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_groups: int,
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="m_grouped_gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,

@@ -183,25 +204,11 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
             "NUM_STAGES": num_stages,
             "NUM_TMA_MULTICAST": tma_multicast_config[0],
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE":
+            "GEMM_TYPE": GemmType.GroupedContiguous,
         },
         space=(),
-
-
-            ("lhs", torch.float8_e4m3fn),
-            ("lhs_scales", torch.float),
-            ("rhs", torch.float8_e4m3fn),
-            ("rhs_scales", torch.float),
-            ("out", torch.bfloat16),
-            ("grouped_layout", torch.int32),
-            ("m", int),
-            ("num_groups", int),
-            ("stream", torch.cuda.Stream),
-            ("num_sms", int),
-            ("smem_size", int),
-        ),
-        template=deep_gemm_grouped_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )


@@ -211,9 +218,20 @@ def _compile_gemm_nt_f8f8bf16_one(
     _: int,  # _ is a dummy parameter to align with other interfaces
     config: Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]],
 ) -> None:
-
-
-
+    num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = config
+    block_k = 128
+    num_tma_threads = 128
+    num_math_threads_per_group = 128
+    kwargs = {
+        "GEMM_TYPE": GemmType.Normal,
+        "NUM_TMA_THREADS": num_tma_threads,
+        "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "NUM_GROUPS": 1,
+        "BLOCK_K": block_k,
+        "NUM_SMS": num_sms,
+        "SMEM_SIZE": smem_config[0],
+    }
+    _, _ = jit_tuner.compile_and_tune(
         name="gemm_fp8_fp8_bf16_nt",
         keys={
             "N": n,

@@ -227,20 +245,8 @@ def _compile_gemm_nt_f8f8bf16_one(
             "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         },
         space=(),
-
-
-            ("lhs", torch.float8_e4m3fn),
-            ("lhs_scales", torch.float),
-            ("rhs", torch.float8_e4m3fn),
-            ("rhs_scales", torch.float),
-            ("out", torch.bfloat16),
-            ("m", int),
-            ("stream", torch.cuda.Stream),
-            ("num_sms", int),
-            ("smem_size", int),
-        ),
-        template=deep_gemm_gemm_template,
-        args=[],
+        kwargs=kwargs,
+        runtime_cls=FP8GemmRuntime,
     )


@@ -293,7 +299,7 @@ def _maybe_compile_deep_gemm_one_type_all(
         logger.info(
             f"Try DeepGEMM JIT Compiling for "
             f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
-            f"{' It only takes a
+            f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
         )

         # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced

@@ -368,7 +374,7 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):

     from deep_gemm.jit.runtime import RuntimeCache

-    origin_func = RuntimeCache.
+    origin_func = RuntimeCache.get

     def __patched_func(self, *args, **kwargs):
         ret = origin_func(self, *args, **kwargs)

@@ -380,6 +386,6 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
         )
         return ret

-    RuntimeCache.
+    RuntimeCache.get = __patched_func
     yield
-    RuntimeCache.
+    RuntimeCache.get = origin_func
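The last two hunks complete `_log_jit_build`: it temporarily replaces `RuntimeCache.get` with a wrapper so that a cache miss (which triggers a JIT build) can be logged, then restores the original afterwards. A generic sketch of that patch-and-restore pattern as a context manager (illustrative, not the exact sglang code, which also knows the M/N/K shape and kernel type):

from contextlib import contextmanager

@contextmanager
def log_cache_misses(cache_cls, logger):
    """Temporarily wrap cache_cls.get so that misses (None results) are logged."""
    origin_get = cache_cls.get

    def patched_get(self, *args, **kwargs):
        ret = origin_get(self, *args, **kwargs)
        if ret is None:
            logger.warning("cache miss: a JIT build will be triggered")
        return ret

    cache_cls.get = patched_get
    try:
        yield
    finally:
        # Always restore the original method, even if the body raises.
        cache_cls.get = origin_get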