sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/model_config.py +2 -1
- sglang/srt/disaggregation/mini_lb.py +2 -2
- sglang/srt/distributed/parallel_state.py +46 -41
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +5 -1
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +3 -3
- sglang/srt/entrypoints/openai/serving_completions.py +3 -1
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
- sglang/srt/entrypoints/openai/serving_responses.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/layer.py +2 -7
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/utils.py +0 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
- sglang/srt/layers/quantization/modelopt_quant.py +35 -2
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/managers/cache_controller.py +42 -39
- sglang/srt/managers/detokenizer_manager.py +0 -34
- sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
- sglang/srt/managers/schedule_policy.py +3 -2
- sglang/srt/managers/scheduler.py +7 -100
- sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +15 -10
- sglang/srt/mem_cache/hiradix_cache.py +16 -0
- sglang/srt/mem_cache/memory_pool_host.py +18 -11
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/metrics/collector.py +12 -4
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/forward_batch_info.py +16 -17
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +245 -36
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/gpt_oss.py +5 -4
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/longcat_flash.py +26 -15
- sglang/srt/models/longcat_flash_nextn.py +23 -15
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/qwen2_moe.py +4 -1
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/server_args.py +79 -2
- sglang/srt/speculative/eagle_worker.py +158 -112
- sglang/srt/utils.py +12 -10
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py (added, +212 lines)
@@ -0,0 +1,212 @@
+from __future__ import annotations
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+
+from sglang.srt.utils import get_device_name, is_hip
+
+logger = logging.getLogger(__name__)
+_is_hip = is_hip()
+
+
+def get_config_file_name(
+    E: int, N: int, dtype: Optional[str], block_shape: Optional[int] = None
+) -> str:
+    device_name = get_device_name().replace(" ", "_")
+    dtype_selector = "" if not dtype else f",dtype={dtype}"
+    block_shape_selector = (
+        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
+    )
+    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"
+
+
+@functools.lru_cache
+def get_moe_configs(
+    E: int,
+    N: int,
+    dtype: Optional[str],
+    block_n: Optional[int] = 0,
+    block_k: Optional[int] = 0,
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the fused MoE kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the fused_moe kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+    # Supported Triton versions, should be sorted from the newest to the oldest
+    supported_triton_versions = ["3.3.1", "3.2.0", "3.1.0"]
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    json_file_name = get_config_file_name(E, N, dtype, [block_n, block_k])
+
+    # We found that using the fused_moe_kernel config from Triton 3.1.0 with Triton 3.2.0 results in negative performance gains,
+    # so we also include the Triton version as a key for finding the fused_moe_kernel config to achieve the best performance.
+    triton_version = triton.__version__
+    version_dir = f"triton_{triton_version.replace('.', '_')}"
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "configs",
+        version_dir,
+        json_file_name,
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            # Please note that although we find the config files, performance might still be suboptimal.
+            # This is because the tuning environment might differ from your current environment.
+            # For example, updating the Triton version might cause all old configs to become suboptimal.
+            # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
+            # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
+            logger.info(f"Using MoE kernel config from {config_file_path}.")
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # Searching for other triton versions that supports the same config
+    for try_triton_version in supported_triton_versions:
+        if try_triton_version == triton_version:
+            continue
+        try_config_file_path = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            "configs",
+            f"triton_{try_triton_version.replace('.', '_')}",
+            json_file_name,
+        )
+        if os.path.exists(try_config_file_path):
+            with open(try_config_file_path) as f:
+                logger.warning(
+                    f"Config file not found at {config_file_path}. Fallback to triton version {try_triton_version} and use MoE kernel config from {try_config_file_path}. Performance might be sub-optimal!",
+                )
+                # If a configuration has been found, return it
+                return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default MoE kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+    is_marlin: bool,
+    block_shape: Optional[List[int]] = None,
+) -> Dict[str, int]:
+    if dtype == "fp8_w8a8":
+        if block_shape is None:
+            config = {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 8,
+                "num_stages": 2 if _is_hip else 4,
+            }
+            if M <= E:
+                config = {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 128,
+                    "GROUP_SIZE_M": 1,
+                    "num_warps": 4,
+                    "num_stages": 2 if _is_hip else 4,
+                }
+        else:
+            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
+            config = {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": block_shape[0],
+                "BLOCK_SIZE_K": block_shape[1],
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 2 if _is_hip else 3,
+            }
+    else:
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+        }
+        # A heuristic: fused marlin works faster with this config for small M
+        if M <= E or (is_marlin and M <= 32):
+            config = {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+            }
+    return config
+
+
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    is_marlin: bool = False,
+    block_shape: Optional[List[int]] = None,
+):
+    from sglang.srt.layers.moe.fused_moe_triton import get_config
+
+    override_config = get_config()
+    if override_config:
+        config = override_config
+    else:
+        # First try to load optimal config from the file
+        E, _, N = w2_shape
+        block_n = block_shape[0] if block_shape else 0
+        block_k = block_shape[1] if block_shape else 0
+        configs = get_moe_configs(E, N, dtype, block_n, block_k)
+
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Else use the default config
+            config = get_default_config(
+                M, E, N, w1_shape[2], top_k, dtype, is_marlin, block_shape
+            )
+    return config
+
+
+def get_config_dtype_str(
+    dtype: torch.dtype,
+    use_int8_w8a16: Optional[bool] = False,
+    use_int4_w4a16: Optional[bool] = False,
+    use_fp8_w8a8: Optional[bool] = False,
+    use_int8_w8a8: Optional[bool] = False,
+):
+    if use_fp8_w8a8:
+        return "fp8_w8a8"
+    elif use_int8_w8a8:
+        return "int8_w8a8"
+    elif use_int4_w4a16:
+        return "int4_w4a16"
+    elif use_int8_w8a16:
+        return "int8_w8a16"
+    elif dtype == torch.float:
+        # avoiding cases where kernel fails when float32 MoE
+        # use fp16/bfloat16 configs
+        return "float32"
+    return None
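
Note: the new fused_moe_triton_config.py module resolves a kernel config by building a device-specific JSON file name and then picking the tuned batch size closest to the actual batch size M. The sketch below is illustrative only and is not part of the diff: it reproduces the naming scheme and the nearest-M lookup in plain Python, using a hypothetical example_config_file_name helper, a hard-coded device name, and a toy_configs table standing in for a real tuned JSON file.

# Illustrative sketch only (not from the package): mirrors the config-file
# naming and nearest-batch-size lookup implemented in fused_moe_triton_config.py.
from typing import Dict, List, Optional


def example_config_file_name(
    E: int, N: int, dtype: Optional[str], block_shape: Optional[List[int]], device_name: str
) -> str:
    # Same naming scheme as get_config_file_name(), but the device name is
    # passed in explicitly so this sketch does not depend on sglang or torch.
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    block_shape_selector = (
        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
    )
    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"


# Reproduces the name of the config file added in this release:
# E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
print(
    example_config_file_name(
        E=257, N=128, dtype="fp8_w8a8", block_shape=[128, 128],
        device_name="NVIDIA_H100_80GB_HBM3",
    )
)

# Toy stand-in for a tuned JSON file: keys are the tuned batch sizes,
# values are Triton kernel launch parameters (values here are made up).
toy_configs: Dict[int, Dict[str, int]] = {
    1: {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1},
    64: {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16},
    1024: {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32},
}

# try_get_optimal_moe_config() picks the tuned batch size closest to the
# actual M; for M=100 that is the entry keyed by 64.
M = 100
chosen = toy_configs[min(toy_configs.keys(), key=lambda x: abs(x - M))]
print(chosen)

The first print matches the H100 fp8_w8a8 config file listed in the change summary above; the second shows that for M=100 the entry tuned at batch size 64 is selected, which is the "closest batch size in the grid" behavior described in the get_moe_configs docstring.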