sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (95)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +0 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  12. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  13. sglang/srt/constrained/xgrammar_backend.py +26 -4
  14. sglang/srt/custom_op.py +0 -62
  15. sglang/srt/disaggregation/decode.py +62 -6
  16. sglang/srt/disaggregation/mini_lb.py +5 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +32 -62
  18. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  19. sglang/srt/disaggregation/prefill.py +40 -4
  20. sglang/srt/disaggregation/utils.py +15 -0
  21. sglang/srt/entrypoints/verl_engine.py +7 -5
  22. sglang/srt/layers/activation.py +6 -8
  23. sglang/srt/layers/attention/flashattention_backend.py +114 -71
  24. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  25. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  26. sglang/srt/layers/attention/triton_backend.py +6 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  28. sglang/srt/layers/layernorm.py +1 -1
  29. sglang/srt/layers/linear.py +17 -3
  30. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  31. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  34. sglang/srt/layers/moe/topk.py +27 -30
  35. sglang/srt/layers/parameter.py +0 -2
  36. sglang/srt/layers/quantization/__init__.py +1 -0
  37. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  38. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +8 -2
  39. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  40. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  41. sglang/srt/layers/quantization/fp8.py +115 -132
  42. sglang/srt/layers/quantization/fp8_kernel.py +213 -57
  43. sglang/srt/layers/quantization/fp8_utils.py +187 -262
  44. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  45. sglang/srt/layers/quantization/utils.py +5 -11
  46. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  47. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  48. sglang/srt/layers/radix_attention.py +15 -0
  49. sglang/srt/layers/rotary_embedding.py +3 -2
  50. sglang/srt/layers/sampler.py +5 -10
  51. sglang/srt/lora/backend/base_backend.py +18 -2
  52. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  53. sglang/srt/lora/backend/triton_backend.py +1 -1
  54. sglang/srt/lora/layers.py +1 -1
  55. sglang/srt/lora/lora.py +1 -1
  56. sglang/srt/lora/lora_manager.py +1 -1
  57. sglang/srt/managers/detokenizer_manager.py +0 -1
  58. sglang/srt/managers/io_struct.py +1 -0
  59. sglang/srt/managers/mm_utils.py +4 -3
  60. sglang/srt/managers/multimodal_processor.py +0 -2
  61. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  62. sglang/srt/managers/schedule_batch.py +2 -4
  63. sglang/srt/managers/scheduler.py +12 -71
  64. sglang/srt/managers/tokenizer_manager.py +1 -0
  65. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  66. sglang/srt/mem_cache/memory_pool.py +7 -2
  67. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  68. sglang/srt/model_executor/model_runner.py +20 -27
  69. sglang/srt/models/bert.py +398 -0
  70. sglang/srt/models/deepseek.py +1 -1
  71. sglang/srt/models/deepseek_nextn.py +74 -70
  72. sglang/srt/models/deepseek_v2.py +289 -348
  73. sglang/srt/models/llama.py +5 -5
  74. sglang/srt/models/minicpm3.py +29 -201
  75. sglang/srt/models/qwen2.py +4 -1
  76. sglang/srt/models/qwen2_moe.py +14 -13
  77. sglang/srt/models/qwen3.py +335 -0
  78. sglang/srt/models/qwen3_moe.py +423 -0
  79. sglang/srt/reasoning_parser.py +0 -1
  80. sglang/srt/sampling/sampling_batch_info.py +2 -3
  81. sglang/srt/server_args.py +34 -32
  82. sglang/srt/speculative/eagle_worker.py +4 -7
  83. sglang/srt/utils.py +16 -1
  84. sglang/test/runners.py +5 -1
  85. sglang/test/test_block_fp8.py +167 -0
  86. sglang/test/test_custom_ops.py +1 -1
  87. sglang/version.py +1 -1
  88. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +3 -3
  89. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +92 -91
  90. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  91. sglang/lang/__init__.py +0 -0
  92. sglang/srt/lora/backend/__init__.py +0 -25
  93. sglang/srt/server.py +0 -18
  94. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  95. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py
@@ -8,15 +8,6 @@ import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
-from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
-from sglang.srt.layers.quantization.utils import (
-    all_close_1d,
-    convert_to_channelwise,
-    is_layer_skipped,
-    per_tensor_dequantize,
-    requantize_with_max_scale,
-)
-
 try:
     from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
         apply_fp8_marlin_linear,
@@ -27,11 +18,12 @@ try:
 except ImportError:
     MARLIN_FP8_AVAILABLE = False
 
-    def apply_fp8_marlin_linear(*args, **kwargs):
-        raise ImportError("vllm is not installed")
+    def dummy_func(*args, **kwargs):
+        raise ImportError(
+            "marlin FP8 requires some operators from vllm. Please install vllm."
+        )
 
-    def prepare_fp8_layer_for_marlin(*args, **kwargs):
-        raise ImportError("vllm is not installed")
+    apply_fp8_marlin_linear = prepare_fp8_layer_for_marlin = dummy_func
 
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
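Aside: the hunk above collapses the two per-function stubs into one shared fallback. A minimal standalone sketch of that import-fallback pattern, using hypothetical names rather than anything from the package:

# Illustrative only: when an optional dependency is missing, alias every
# entry point to one stub that raises ImportError only if it is called.
try:
    from optional_dep import fast_op_a, fast_op_b  # hypothetical dependency
except ImportError:

    def _missing_dep(*args, **kwargs):
        raise ImportError("this code path requires optional_dep; please install it")

    fast_op_a = fast_op_b = _missing_dep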
@@ -49,7 +41,10 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+)
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     apply_w8a8_block_fp8_linear,
@@ -57,30 +52,35 @@ from sglang.srt.layers.quantization.fp8_utils import (
     input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.utils import (
+    all_close_1d,
+    convert_to_channelwise,
+    is_layer_skipped,
+    per_tensor_dequantize,
+    requantize_with_max_scale,
+)
 from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
-    permute_weight,
     print_warning_once,
     set_weight_attrs,
 )
 
-ACTIVATION_SCHEMES = ["static", "dynamic"]
-
 _is_hip = is_hip()
+_is_cuda = is_cuda()
 
 if _is_hip:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4
     from aiter.ops.shuffle import shuffle_weight
 
-_is_cuda = is_cuda()
+if not _is_cuda:
+    from vllm._custom_ops import scaled_fp8_quant
 
-if _is_cuda:
-    from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
-else:
-    from vllm import _custom_ops as vllm_ops
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
 
 logger = logging.getLogger(__name__)
 
@@ -243,7 +243,6 @@ class Fp8LinearMethod(LinearMethodBase):
                )
 
        layer.logical_widths = output_partition_sizes
-
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.orig_dtype = params_dtype
@@ -327,7 +326,9 @@ class Fp8LinearMethod(LinearMethodBase):
                layer.weight_scale_inv.data, requires_grad=False
            )
            return
+
        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
+
        # If checkpoint not serialized fp8, quantize the weights.
        if not self.quant_config.is_checkpoint_fp8_serialized:
            if self.cutlass_fp8_supported or self.use_marlin:
@@ -391,12 +392,9 @@ class Fp8LinearMethod(LinearMethodBase):
            )
 
        if self.use_marlin:
-            try:
-                prepare_fp8_layer_for_marlin(layer)
-                # Activations not quantized for marlin.
-                del layer.input_scale
-            except ImportError:
-                self.use_marlin = False
+            prepare_fp8_layer_for_marlin(layer)
+            # Activations not quantized for marlin.
+            del layer.input_scale
 
    def apply(
        self,
@@ -406,18 +404,15 @@ class Fp8LinearMethod(LinearMethodBase):
    ) -> torch.Tensor:
 
        if self.use_marlin:
-            try:
-                return apply_fp8_marlin_linear(
-                    input=x,
-                    weight=layer.weight,
-                    weight_scale=layer.weight_scale,
-                    workspace=layer.workspace,
-                    size_n=layer.output_size_per_partition,
-                    size_k=layer.input_size_per_partition,
-                    bias=bias,
-                )
-            except ImportError:
-                self.use_marlin = False
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias,
+            )
 
        if self.block_quant:
            return apply_w8a8_block_fp8_linear(
@@ -516,7 +511,7 @@ class Fp8MoEMethod:
                    )
 
        # WEIGHTS
-        if get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
            # INT4 MoE weight - INT32 packed
            w13_weight = torch.nn.Parameter(
                torch.empty(
@@ -617,7 +612,7 @@ class Fp8MoEMethod:
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
-        if get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
            extra_weight_attrs.update(
                {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
            )
@@ -649,7 +644,7 @@ class Fp8MoEMethod:
            layer.w2_input_scale = None
 
    def process_weights_after_loading(self, layer: Module) -> None:
-        if get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
            self.process_weights_hip_int4(layer)
            return
 
@@ -706,20 +701,12 @@ class Fp8MoEMethod:
                requires_grad=False,
            )
            for expert in range(layer.num_experts):
-                if _is_cuda:
-                    w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
-                        sgl_scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
-                    )
-                    w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
-                        sgl_scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
-                    )
-                else:
-                    w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
-                        vllm_ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
-                    )
-                    w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
-                        vllm_ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
-                    )
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
 
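The per-expert loop above relies on `scaled_fp8_quant` returning a quantized tensor together with its scale. A rough pure-PyTorch reference of dynamic per-tensor FP8 quantization, for illustration only (the real kernels in `sglang.srt.layers.quantization.fp8_kernel` and `vllm._custom_ops` differ in implementation detail):

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def scaled_fp8_quant_ref(w: torch.Tensor):
    """Reference-only dynamic per-tensor FP8 quantization: returns (q, scale)."""
    scale = w.abs().amax().float() / FP8_MAX  # map the largest value to the FP8 max
    scale = scale.clamp(min=torch.finfo(torch.float32).tiny)  # guard all-zero tensors
    q = (w.float() / scale).clamp(-FP8_MAX, FP8_MAX)
    return q.to(torch.float8_e4m3fn), scale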
@@ -796,18 +783,10 @@ class Fp8MoEMethod:
                        layer.w13_weight[expert_id][start : start + shard_size, :],
                        layer.w13_weight_scale[expert_id][shard_id],
                    )
-                    if _is_cuda:
-                        (
-                            layer.w13_weight[expert_id][start : start + shard_size, :],
-                            _,
-                        ) = sgl_scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
-                    else:
-                        (
-                            layer.w13_weight[expert_id][start : start + shard_size, :],
-                            _,
-                        ) = vllm_ops.scaled_fp8_quant(
-                            dq_weight, max_w13_scales[expert_id]
-                        )
+                    (
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        _,
+                    ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
                    start += shard_size
 
            layer.w13_weight_scale = torch.nn.Parameter(
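The loop above dequantizes each w13 shard with its own scale and requantizes it under the shared per-expert maximum scale (the job of `per_tensor_dequantize` followed by `scaled_fp8_quant` with an explicit scale). A self-contained sketch of that step, again for illustration only:

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def requantize_with_max_scale_ref(shard_q, shard_scales):
    """Reference-only: re-express per-shard FP8 tensors under one shared scale.

    shard_q: list of FP8 tensors; shard_scales: 1-D float tensor of their scales.
    """
    max_scale = shard_scales.max()
    out = []
    for q, s in zip(shard_q, shard_scales):
        dq = q.to(torch.float32) * s                    # per-tensor dequantize
        rq = (dq / max_scale).clamp(-FP8_MAX, FP8_MAX)  # requantize with max scale
        out.append(rq.to(torch.float8_e4m3fn))
    return out, max_scale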
@@ -913,6 +892,7 @@ class Fp8MoEMethod:
        apply_router_weight_on_input: bool = False,
        inplace: bool = True,
        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
        from sglang.srt.layers.moe.topk import select_experts
@@ -928,43 +908,14 @@ class Fp8MoEMethod:
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
        )
 
-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
-            # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
-            assert not no_combine, f"{no_combine=} is not supported."
-            return ck_moe_2stages_win4(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                layer.w13_weight_scale1,
-                layer.w2_weight_scale1,
-                activation=(
-                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
-                ),
-            )
-        if _is_hip and get_bool_env_var("CK_MOE"):
-            assert not no_combine, f"{no_combine=} is not supported."
-            if self.block_quant:
-                # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
-                assert (
-                    activation == "silu"
-                ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
-                return asm_moe(
-                    x,
-                    layer.w13_weight,
-                    layer.w2_weight,
-                    topk_weights,
-                    topk_ids,
-                    layer.w13_weight_scale_inv,
-                    layer.w2_weight_scale_inv,
-                    block_shape=tuple(self.quant_config.weight_block_size),
-                    expert_mask=None,
-                )
-            else:
-                return ck_moe_2stages(
+        if _is_hip:
+            if get_bool_env_var("USE_INT4_WEIGHT"):
+                # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
+                assert not no_combine, f"{no_combine=} is not supported."
+                return ck_moe_2stages_win4(
                    x,
                    layer.w13_weight,
                    layer.w2_weight,
@@ -978,33 +929,65 @@ class Fp8MoEMethod:
                        else ActivationType.Gelu
                    ),
                )
-        else:
-            # Expert fusion with FP8 quantization
-            return fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=inplace and not no_combine,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                use_fp8_w8a8=True,
-                w1_scale=(
-                    layer.w13_weight_scale_inv
-                    if self.block_quant
-                    else layer.w13_weight_scale
-                ),
-                w2_scale=(
-                    layer.w2_weight_scale_inv
-                    if self.block_quant
-                    else layer.w2_weight_scale
-                ),
-                a1_scale=layer.w13_input_scale,
-                a2_scale=layer.w2_input_scale,
-                block_shape=self.quant_config.weight_block_size,
-                no_combine=no_combine,
-            )
+
+            if get_bool_env_var("CK_MOE"):
+                assert not no_combine, f"{no_combine=} is not supported."
+                if self.block_quant:
+                    # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
+                    assert (
+                        activation == "silu"
+                    ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
+                    return asm_moe(
+                        x,
+                        layer.w13_weight,
+                        layer.w2_weight,
+                        topk_weights,
+                        topk_ids,
+                        layer.w13_weight_scale_inv,
+                        layer.w2_weight_scale_inv,
+                        block_shape=tuple(self.quant_config.weight_block_size),
+                        expert_mask=None,
+                    )
+                else:
+                    return ck_moe_2stages(
+                        x,
+                        layer.w13_weight,
+                        layer.w2_weight,
+                        topk_weights,
+                        topk_ids,
+                        layer.w13_weight_scale1,
+                        layer.w2_weight_scale1,
+                        activation=(
+                            ActivationType.Silu
+                            if activation == "silu"
+                            else ActivationType.Gelu
+                        ),
+                    )
+
+        # Expert fusion with FP8 quantization
+        return fused_experts(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=inplace and not no_combine,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            use_fp8_w8a8=True,
+            w1_scale=(
+                layer.w13_weight_scale_inv
+                if self.block_quant
+                else layer.w13_weight_scale
+            ),
+            w2_scale=(
+                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
+            ),
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
+            no_combine=no_combine,
+        )
 
 
 class Fp8KVCacheMethod(BaseKVCacheMethod):