sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +3 -2
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/openai.py +5 -1
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang/srt/disaggregation/decode.py +43 -0
- sglang/srt/disaggregation/mini_lb.py +69 -8
- sglang/srt/disaggregation/mooncake/conn.py +1 -1
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +100 -16
- sglang/srt/disaggregation/utils.py +17 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +781 -150
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +19 -4
- sglang/srt/layers/moe/ep_moe/layer.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8_kernel.py +7 -38
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/w8a8_int8.py +3 -3
- sglang/srt/layers/rotary_embedding.py +6 -6
- sglang/srt/layers/sampler.py +2 -2
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/io_struct.py +14 -3
- sglang/srt/managers/schedule_batch.py +13 -0
- sglang/srt/managers/scheduler.py +16 -6
- sglang/srt/managers/tokenizer_manager.py +115 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +40 -32
- sglang/srt/mem_cache/memory_pool.py +31 -13
- sglang/srt/model_executor/cuda_graph_runner.py +13 -8
- sglang/srt/model_executor/model_runner.py +19 -4
- sglang/srt/models/deepseek_v2.py +9 -6
- sglang/srt/models/minicpm3.py +2 -2
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/server_args.py +52 -40
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +2 -7
- sglang/srt/utils.py +46 -5
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
CHANGED
@@ -3,10 +3,10 @@ import triton
 import triton.language as tl
 
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
-
-if
+_is_cuda = is_cuda()
+if _is_cuda:
     CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 _is_hip = is_hip()
@@ -1037,12 +1037,12 @@ def extend_attention_fwd(
         num_warps = 4
 
     else:
-        if
+        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
             if Lq <= 256:
                 BLOCK_M, BLOCK_N = (128, 64)
             else:
                 BLOCK_M, BLOCK_N = (32, 64)
-        elif
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
             if Lq <= 128:
                 BLOCK_M, BLOCK_N = (128, 128)
             elif Lq <= 256:
sglang/srt/layers/attention/triton_ops/extend_attention.py
CHANGED
@@ -23,10 +23,10 @@ import triton.language as tl
 from sglang.srt.layers.attention.triton_ops.prefill_attention import (
     context_attention_fwd,
 )
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
-
-if
+_is_cuda = is_cuda()
+if _is_cuda:
     CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 _is_hip = is_hip()
@@ -345,12 +345,12 @@ def extend_attention_fwd(
         num_warps = 4
 
    else:
-        if
+        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
            if Lq <= 256:
                BLOCK_M, BLOCK_N = (128, 64)
            else:
                BLOCK_M, BLOCK_N = (32, 64)
-        elif
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
            # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
            if CUDA_CAPABILITY[1] == 9 or CUDA_CAPABILITY[1] == 6:
                if Lq <= 128:
sglang/srt/layers/attention/triton_ops/prefill_attention.py
CHANGED
@@ -22,8 +22,12 @@ import torch
 import triton
 import triton.language as tl
 
-
-
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda or _is_hip:
     CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
@@ -172,7 +176,7 @@ def context_attention_fwd(
         b_seq_len: [b]
         out: [b * s, head, head_dim]
     """
-    if
+    if (_is_cuda or _is_hip) and CUDA_CAPABILITY[0] > 8:
        BLOCK = 128
    else:
        BLOCK = 64
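Note on the three attention-kernel hunks above: they replace the old implicit CUDA probe with the explicit is_cuda() / is_hip() helpers from sglang.srt.utils, and every read of CUDA_CAPABILITY is now guarded by those flags, so importing the kernels on a CPU-only build no longer touches torch.cuda. A minimal sketch of the resulting gating pattern, condensed from the hunks (pick_block_size is an illustrative name, not a function in the package):

import torch

from sglang.srt.utils import is_cuda, is_hip

_is_cuda = is_cuda()
_is_hip = is_hip()

# Probe the device only when a GPU backend is actually present.
if _is_cuda or _is_hip:
    CUDA_CAPABILITY = torch.cuda.get_device_capability()


def pick_block_size() -> int:
    # Same shape as the gate in context_attention_fwd: larger blocks on
    # SM90+ parts, a conservative default everywhere else.
    if (_is_cuda or _is_hip) and CUDA_CAPABILITY[0] > 8:
        return 128
    return 64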
sglang/srt/layers/layernorm.py
CHANGED
@@ -20,9 +20,12 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import
+from sglang.srt.utils import is_cuda, is_hip
 
-
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
 
 if _is_cuda:
     from sgl_kernel import (
@@ -32,8 +35,20 @@ if _is_cuda:
         rmsnorm,
     )
 
+if _is_hip:
 
-
+    from aiter.ops.rmsnorm import rms_norm, rmsnorm2d_fwd_with_add
+
+    rmsnorm = rms_norm
+
+    def fused_add_rmsnorm(
+        x: torch.Tensor,
+        residual: torch.Tensor,
+        w: torch.Tensor,
+        eps: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        rmsnorm2d_fwd_with_add(x, x, residual, residual, w, eps)
+        return x, residual
 
 
 class RMSNorm(CustomOp):
@@ -139,7 +154,7 @@ class Gemma3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.eps}"
 
 
-if not _is_cuda:
+if not (_is_cuda or _is_hip):
     logger.info(
         "sgl-kernel is not available on Non-NV platforms. Fallback to other kernel libraries."
    )
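The HIP branch added above maps AMD's aiter kernels onto the same names the CUDA path imports from sgl_kernel (rmsnorm, fused_add_rmsnorm), so the call sites inside RMSNorm stay backend-agnostic. As a rough reference for the contract of the shimmed fused_add_rmsnorm, here is a pure-PyTorch sketch of the math; the aiter kernel performs the same update in place:

from typing import Tuple

import torch


def fused_add_rmsnorm_ref(
    x: torch.Tensor,
    residual: torch.Tensor,
    w: torch.Tensor,
    eps: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # residual becomes x + residual; x becomes RMSNorm(x + residual) * w.
    residual = x + residual
    variance = residual.float().pow(2).mean(dim=-1, keepdim=True)
    x = (residual.float() * torch.rsqrt(variance + eps)).to(x.dtype) * w
    return x, residual


# Example call site, decoder-layer style.
h = torch.randn(4, 128)
res = torch.randn(4, 128)
h, res = fused_add_rmsnorm_ref(h, res, torch.ones(128), eps=1e-6)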
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED
@@ -802,6 +802,7 @@ class DeepEPMoE(EPMoE):
         correction_bias: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
         activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
         deepep_mode: DeepEPMode = DeepEPMode.auto,
     ):
         super().__init__(
@@ -820,6 +821,7 @@ class DeepEPMoE(EPMoE):
             correction_bias,
             custom_routing_function,
             activation,
+            routed_scaling_factor,
         )
         self.deepep_mode = deepep_mode
         if self.deepep_mode.enable_low_latency():
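The two DeepEPMoE hunks simply thread a new routed_scaling_factor keyword through to the parent EPMoE constructor, which evidently accepts the extra argument in this release. A toy sketch of the pattern, using placeholder class names rather than the real EPMoE signature:

from typing import Optional


class BaseMoE:
    def __init__(self, routed_scaling_factor: Optional[float] = None):
        # The base class owns the factor; subclasses just pass it through.
        self.routed_scaling_factor = routed_scaling_factor


class DeepEPStyleMoE(BaseMoE):
    def __init__(
        self,
        routed_scaling_factor: Optional[float] = None,
        deepep_mode: str = "auto",
    ):
        super().__init__(routed_scaling_factor=routed_scaling_factor)
        self.deepep_mode = deepep_mode


moe = DeepEPStyleMoE(routed_scaling_factor=2.5, deepep_mode="low_latency")
print(moe.routed_scaling_factor)  # 2.5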
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
CHANGED
@@ -33,6 +33,7 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe im
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
     CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A16Fp8,
 )
 from sglang.srt.layers.quantization.compressed_tensors.utils import (
     find_matched_target,
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
CHANGED
@@ -2,8 +2,10 @@
 
 from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
+from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 
 __all__ = [
     "CompressedTensorsScheme",
     "CompressedTensorsW8A8Fp8",
+    "CompressedTensorsW8A16Fp8",
 ]
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
ADDED
@@ -0,0 +1,153 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, List, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme,
+)
+from sglang.srt.layers.quantization.utils import convert_to_channelwise
+
+try:
+    from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+        apply_fp8_marlin_linear,
+        prepare_fp8_layer_for_marlin,
+    )
+
+    MARLIN_FP8_AVAILABLE = True
+except ImportError:
+    MARLIN_FP8_AVAILABLE = False
+
+    def apply_fp8_marlin_linear(*args, **kwargs):
+        raise ImportError("vllm is not installed")
+
+    def prepare_fp8_layer_for_marlin(*args, **kwargs):
+        raise ImportError("vllm is not installed")
+
+
+__all__ = ["CompressedTensorsW8A16Fp8"]
+
+SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR]
+
+
+class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
+        self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
+
+        if not MARLIN_FP8_AVAILABLE:
+            raise ImportError(
+                "vllm is not installed. To use CompressedTensorsW8A16Fp8, please install vllm"
+            )
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # ampere and up
+        return 80
+
+    # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
+    # So if we have a fused module (QKV, MLP) with per tensor scales,
+    # we expand each scale to its shard's channels.
+    def process_weights_after_loading(self, layer) -> None:
+        if self.strategy == QuantizationStrategy.TENSOR:
+            ws_channelwise = convert_to_channelwise(
+                layer.weight_scale, layer.logical_widths
+            )
+            layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False)
+        else:
+            # required by torch.compile to be torch.nn.Parameter
+            layer.weight_scale = torch.nn.Parameter(
+                layer.weight_scale.data, requires_grad=False
+            )
+
+        # Weights must be transposed for marlin
+        layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False)
+
+        if self.is_static_input_scheme:
+            # required by torch.compile to be torch.nn.Parameter
+            layer.input_scale = torch.nn.Parameter(
+                layer.input_scale.data, requires_grad=False
+            )
+        prepare_fp8_layer_for_marlin(layer, strategy="channel")
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+        elif self.strategy == QuantizationStrategy.TENSOR:
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported weight strategy={self.strategy}, "
+                f"supported strategies are {SUPPORTED_STRATEGIES}"
+            )
+
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE (to deal with converted checkpoints)
+        if self.is_static_input_scheme:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_fp8_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )