sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff shows the changes between these two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- sglang/bench_one_batch.py +113 -17
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +11 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +4 -3
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +71 -0
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/vision.py +13 -5
- sglang/srt/layers/communicator.py +21 -4
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +2 -7
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +77 -73
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +3 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +55 -30
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +28 -7
- sglang/srt/managers/scheduler.py +26 -12
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +24 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +7 -6
- sglang/srt/model_executor/forward_batch_info.py +35 -14
- sglang/srt/model_executor/model_runner.py +19 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +72 -33
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +24 -12
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +142 -7
- sglang/srt/two_batch_overlap.py +157 -5
- sglang/srt/utils.py +38 -2
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
CHANGED
@@ -6,15 +6,50 @@ from typing import TYPE_CHECKING, Optional
 
 import torch
 from sgl_kernel import gelu_and_mul, silu_and_mul
-from triton_kernels.matmul_ogs import
+from triton_kernels.matmul_ogs import (
+    FlexCtx,
+    FnSpecs,
+    FusedActivation,
+    PrecisionConfig,
+    matmul_ogs,
+)
+from triton_kernels.numerics import InFlexData
 from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
-
-from sglang.srt.utils import direct_register_custom_op
+from triton_kernels.swiglu import swiglu_fn
 
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.topk import TopKOutput
 
 
+def quantize(w, dtype, dev, **opt):
+    if dtype == "bf16":
+        return w.to(torch.bfloat16), InFlexData()
+    elif dtype == "fp8":
+        wq = w.to(torch.float8_e4m3fn).transpose(-1, -2).contiguous().transpose(-1, -2)
+        return (
+            wq,
+            InFlexData(dtype=wq.dtype, scale=w.abs().max().unsqueeze(0)),
+            MicroscalingCtx(),
+        )
+    else:
+        assert dtype == "mx4", f"{dtype=}"
+        swizzle_mx_scale = opt["swizzle_mx_scale"]
+        swizzle_axis = 2 if swizzle_mx_scale else None
+        w = w.to(torch.bfloat16)
+        w, mx_scales, weight_scale_shape = downcast_to_mxfp(
+            w, torch.uint8, axis=1, swizzle_axis=swizzle_axis
+        )
+        return (
+            w,
+            InFlexData(),
+            MicroscalingCtx(
+                weight_scale=mx_scales,
+                swizzle_mx=swizzle_mx_scale,
+                actual_weight_scale_shape=weight_scale_shape,
+            ),
+        )
+
+
 def triton_kernel_moe_forward(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -146,3 +181,153 @@ def triton_kernel_fused_experts(
     )
 
     return intermediate_cache3
+
+
+def triton_kernel_moe_with_bias_forward(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w1_pcg,
+    b1: torch.Tensor,
+    w2: torch.Tensor,
+    w2_pcg,
+    b2: torch.Tensor,
+    topk_output: TopKOutput,
+    inplace: bool = False,
+    activation: str = "silu",
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[int] = None,
+) -> torch.Tensor:
+    assert topk_output.format.is_triton_kernel()
+    routing_data, gather_idx, scatter_idx = topk_output
+
+    return triton_kernel_fused_experts_with_bias(
+        hidden_states,
+        w1=w1,
+        w1_pcg=w1_pcg,
+        b1=b1,
+        w2=w2,
+        w2_pcg=w2_pcg,
+        b2=b2,
+        routing_data=routing_data,
+        gather_indx=gather_idx,
+        scatter_indx=scatter_idx,
+        inplace=inplace,
+        activation=activation,
+        use_fp8_w8a8=use_fp8_w8a8,
+        per_channel_quant=per_channel_quant,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+        activation_alpha=activation_alpha,
+        swiglu_limit=swiglu_limit,
+    )
+
+
+def triton_kernel_fused_experts_with_bias(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w1_pcg,
+    b1: torch.Tensor,
+    w2: torch.Tensor,
+    w2_pcg,
+    b2: torch.Tensor,
+    routing_data: RoutingData,
+    gather_indx: GatherIndx,
+    scatter_indx: ScatterIndx,
+    inplace: bool = False,
+    activation: str = "silu",
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[int] = None,
+) -> torch.Tensor:
+    # print(f"here in triton moe with bias", b1.shape, b1.dtype, b2.shape, b2.dtype)
+    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant == False, "per_channel_quant is not supported"
+    assert expert_map == None, "expert_map is not supported"
+    assert w1_scale == None, "w1_scale is not supported"
+    assert w2_scale == None, "w2_scale is not supported"
+    assert a1_scale == None, "a1_scale is not supported"
+    assert a2_scale == None, "a2_scale is not supported"
+    assert block_shape == None, "block_shape is not supported"
+
+    # type check
+    assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+    for w in (w1, w2):
+        # TODO assert bf16 or mxfp4
+        # assert (w.dtype == torch.bfloat16) or check-is-mxfp4, f"w must be bfloat16 or mxfp4 {w1.dtype=}"
+        pass
+
+    # Shape check
+    assert hidden_states.ndim == 2, "hidden_states must be 2D"
+    assert (
+        hidden_states.shape[-1] == w1.shape[-2]
+    ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+    assert (
+        w2.shape[-1] == w1.shape[1]
+    ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+    # feature check
+    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+
+    E, _, _ = w1.shape
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    # TODO maybe completely remove this branch
+    if w1.dtype == torch.bfloat16:
+        device = "cuda"
+        optg = dict()
+        w1, w1_flex = quantize(w1, "bf16", device, **optg)
+        w1_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex))
+
+        w2, w2_flex = quantize(w2, "bf16", device, **optg)
+        w2_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex))
+
+    act = FusedActivation(
+        FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),
+        (activation_alpha, swiglu_limit),
+        2,
+    )
+
+    intermediate_cache = matmul_ogs(
+        hidden_states,
+        w1,
+        b1,
+        routing_data,
+        gather_indx=gather_indx,
+        precision_config=w1_pcg,
+        gammas=None,
+        fused_activation=act,
+    )
+
+    return matmul_ogs(
+        intermediate_cache,
+        w2,
+        b2,
+        routing_data,
+        scatter_indx=scatter_indx,
+        precision_config=w2_pcg,
+        gammas=routing_data.gate_scal,
+    )
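The fp8 branch of the new `quantize` helper stores each expert's weight matrix in a transposed-contiguous (column-major) layout while keeping the logical shape. A minimal sketch of that layout trick in plain PyTorch, assuming a build with float8 support; it is illustrative only and not part of the diff:

```python
# Illustrative only: the fp8 path keeps the logical weight shape but makes
# each expert matrix column-major in memory via transpose -> contiguous -> transpose.
import torch

w = torch.randn(2, 4, 8)  # (experts, in_features, out_features)
wq = w.to(torch.float8_e4m3fn).transpose(-1, -2).contiguous().transpose(-1, -2)

print(wq.shape)     # torch.Size([2, 4, 8]) -- logical shape unchanged
print(wq.stride())  # (32, 1, 4) -- column-major within each expert matrix
```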
sglang/srt/layers/moe/topk.py
CHANGED
@@ -185,8 +185,9 @@ class TopK(CustomOp):
         expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
     ) -> TopKOutput:
         if self.use_triton_kernels:
+            # renormalize=True is equivalent to sm_first=False
             routing_data, gather_idx, scatter_idx = routing(
-                router_logits, self.top_k, self.renormalize
+                router_logits, self.top_k, sm_first=not self.renormalize
             )
             return TritonKernelTopKOutput(routing_data, gather_idx, scatter_idx)
         else:
@@ -397,8 +398,12 @@ def grouped_topk_gpu(
         .reshape(num_token, -1)
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
     topk_weights, topk_ids = torch.topk(
-        tmp_scores,
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
     )
     if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
@@ -488,8 +493,12 @@ def biased_grouped_topk_impl(
     tmp_scores = scores_for_choice.masked_fill(
         ~score_mask.bool(), float("-inf")
     )  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
     _, topk_ids = torch.topk(
-        tmp_scores,
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
     )
     topk_weights = scores.gather(1, topk_ids)
 
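The new comment in `TopK` states that `renormalize=True` corresponds to `sm_first=False`. A small self-contained check of that equivalence in plain PyTorch (independent of the triton_kernels routing API, added here for illustration):

```python
# Softmax-then-renormalize over the top-k equals softmax taken only over the
# top-k logits: renormalizing exp(l_i)/Z over a subset cancels the common Z.
import torch

logits = torch.randn(4, 8)
k = 2

# renormalize=True: softmax over all experts, then renormalize the selected k
probs = torch.softmax(logits, dim=-1)
topk_p, topk_idx = torch.topk(probs, k=k, dim=-1)
renormalized = topk_p / topk_p.sum(dim=-1, keepdim=True)

# sm_first=False: pick the top-k logits first, then softmax over just those
softmax_last = torch.softmax(logits.gather(-1, topk_idx), dim=-1)

assert torch.allclose(renormalized, softmax_last, atol=1e-6)
```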
sglang/srt/layers/moe/utils.py
CHANGED
@@ -1,4 +1,20 @@
+import importlib.util
 from enum import Enum
+from functools import lru_cache
+
+from packaging import version as pkg_version
+
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    result = global_server_args_dict["enable_flashinfer_trtllm_moe"] and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+    return result
 
 
 class MoeA2ABackend(Enum):
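`should_use_flashinfer_trtllm_moe` gates a feature on a server flag plus the installed flashinfer version and caches the result with `lru_cache`. A generic sketch of that optional-dependency version-gate pattern, using a hypothetical numpy check rather than sglang code:

```python
# Hypothetical example of the same pattern: cache a boolean that depends on
# whether an optional package is installed and recent enough.
import importlib.util
from functools import lru_cache

from packaging import version as pkg_version


@lru_cache(maxsize=1)
def has_recent_numpy(min_version: str = "1.24") -> bool:
    # Return False when the optional dependency is missing entirely.
    if importlib.util.find_spec("numpy") is None:
        return False
    import numpy
    return pkg_version.parse(numpy.__version__) >= pkg_version.parse(min_version)


print(has_recent_numpy())
```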
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -47,6 +47,12 @@ from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
 )
+from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
+
+is_mxfp_supported = mxfp_supported()
+if is_mxfp_supported:
+    from sglang.srt.layers.quantization.fp4 import MxFp4Config
+
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.gptq import (
     GPTQConfig,
@@ -60,6 +66,7 @@ from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp8Config,
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
 from sglang.srt.layers.quantization.petit import PetitNvFp4Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.utils import get_linear_quant_method
@@ -85,6 +92,21 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "petit_nvfp4": PetitNvFp4Config,
 }
 
+
+if is_cuda():
+    BASE_QUANTIZATION_METHODS.update(
+        {
+            "quark": Mxfp4Config,
+            "mxfp4": Mxfp4Config,
+        }
+    )
+elif is_mxfp_supported and is_hip():
+    BASE_QUANTIZATION_METHODS.update(
+        {
+            "quark": MxFp4Config,
+            "mxfp4": MxFp4Config,
+        }
+    )
 # VLLM-dependent quantization methods
 VLLM_QUANTIZATION_METHODS = {
     "aqlm": AQLMConfig,