sglang 0.4.7.post1__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +13 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +12 -16
- sglang/srt/disaggregation/prefill.py +17 -13
- sglang/srt/disaggregation/utils.py +46 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +22 -28
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +67 -29
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +21 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +19 -9
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +207 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +6 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +91 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/io_struct.py +9 -12
- sglang/srt/managers/schedule_batch.py +40 -31
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +147 -62
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +76 -45
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +22 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +108 -26
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +36 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/utils.py +177 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/RECORD +104 -93
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/topk.py
CHANGED
@@ -28,10 +28,18 @@ from sglang.srt.managers.expert_location_dispatch import (
     topk_ids_logical_to_physical,
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_compiler_backend,
+    is_cpu,
+    is_cuda,
+    is_hip,
+)
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
 if _is_cuda:
     from sgl_kernel import moe_fused_gate
@@ -40,7 +48,7 @@ if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
 
 
-def fused_topk_native(
+def fused_topk_torch_native(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     topk: int,
@@ -61,6 +69,20 @@ def fused_topk_native(
     return topk_weights, topk_ids
 
 
+def fused_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    return torch.ops.sgl_kernel.topk_softmax_cpu(
+        hidden_states=hidden_states,
+        gating_output=gating_output,
+        topk=topk,
+        renormalize=renormalize,
+    )
+
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -115,7 +137,7 @@ def _fused_topk_postprocess(
 
 # This is used by the Deepseek V2/V3/R1 series models
 @torch.compile(dynamic=True, backend=get_compiler_backend())
-def grouped_topk(
+def grouped_topk_gpu(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     topk: int,
@@ -171,6 +193,32 @@ def grouped_topk(
     return topk_weights, topk_ids
 
 
+def grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+):
+    assert expert_location_dispatch_info is None
+    return torch.ops.sgl_kernel.grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    )
+
+
 def biased_grouped_topk_impl(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -258,7 +306,7 @@ def _biased_grouped_topk_postprocess(
     return topk_ids
 
 
-def biased_grouped_topk(
+def biased_grouped_topk_gpu(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
     correction_bias: torch.Tensor,
@@ -322,6 +370,45 @@ def biased_grouped_topk(
     )
 
 
+def biased_grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    compiled: bool = True,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+):
+    assert expert_location_dispatch_info is None
+    return torch.ops.sgl_kernel.biased_grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    )
+
+
+if _is_cpu and _is_cpu_amx_available:
+    biased_grouped_topk = biased_grouped_topk_cpu
+    grouped_topk = grouped_topk_cpu
+    fused_topk_native = fused_topk_cpu
+else:
+    biased_grouped_topk = biased_grouped_topk_gpu
+    grouped_topk = grouped_topk_gpu
+    fused_topk_native = fused_topk_torch_native
+
+
 def select_experts(
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
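The new `*_cpu` variants above are bound to the public names once at import time, so call sites keep using `grouped_topk`, `biased_grouped_topk`, and `fused_topk_native` unchanged. A minimal sketch of that import-time dispatch pattern; the `HAS_FUSED_KERNEL` flag and `softmax_topk*` names below are illustrative, not sglang APIs:

import torch

def _softmax_topk_reference(gating: torch.Tensor, k: int):
    # Plain PyTorch path: softmax over experts, then per-token top-k.
    probs = torch.softmax(gating, dim=-1)
    return torch.topk(probs, k, dim=-1)

def _softmax_topk_fused(gating: torch.Tensor, k: int):
    # Stand-in for a fused custom op; reuses the reference path so the
    # sketch stays self-contained and runnable.
    return _softmax_topk_reference(gating, k)

# Bind the public name once, the way topk.py now selects between its
# _cpu and _gpu implementations based on a capability check.
HAS_FUSED_KERNEL = False  # assumption: normally set by a capability probe
softmax_topk = _softmax_topk_fused if HAS_FUSED_KERNEL else _softmax_topk_reference

weights, ids = softmax_topk(torch.randn(4, 8), k=2)  # both shaped (4, 2)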
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
CHANGED
@@ -14,14 +14,18 @@ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
 from sglang.srt.layers.quantization.utils import (
     all_close_1d,
+    cpu_has_amx_support,
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import is_cuda, set_weight_attrs
+from sglang.srt.utils import is_cpu, is_cuda, is_npu, set_weight_attrs
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
     from vllm import _custom_ops as vllm_ops
     from vllm._custom_ops import scaled_fp8_quant
 
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -64,9 +64,12 @@ from sglang.srt.layers.quantization.utils import (
 )
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.utils import (
+    cpu_has_amx_support,
     get_bool_env_var,
+    is_cpu,
     is_cuda,
     is_hip,
+    is_npu,
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
@@ -74,6 +77,9 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
 _is_fp8_fnuz = is_fp8_fnuz()
 
@@ -82,10 +88,11 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_hip:
     from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 
-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
     from vllm._custom_ops import scaled_fp8_quant
 
 
@@ -1045,15 +1052,15 @@ class Fp8MoEMethod:
         if _use_hip_int4:
             # TODO: add triton kernel and add check _use_aiter
             assert not no_combine, f"{no_combine=} is not supported."
-            return
+            return fused_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
                 topk_weights,
                 topk_ids,
-                QuantType.per_Token,
-                layer.w13_weight_scale1,
-                layer.w2_weight_scale1,
+                quant_type=QuantType.per_Token,
+                w1_scale=layer.w13_weight_scale1,
+                w2_scale=layer.w2_weight_scale1,
                 activation=(
                     ActivationType.Silu if activation == "silu" else ActivationType.Gelu
                 ),
@@ -1062,31 +1069,32 @@ class Fp8MoEMethod:
         if _use_aiter:
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
-
-                assert (
-                    activation == "silu"
-                ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
-                return asm_moe(
+                return fused_moe(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
-                    layer.w13_weight_scale_inv,
-                    layer.w2_weight_scale_inv,
-
+                    w1_scale=layer.w13_weight_scale_inv,
+                    w2_scale=layer.w2_weight_scale_inv,
+                    quant_type=QuantType.per_128x128,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
                     expert_mask=None,
                 )
             else:
-                return
+                return fused_moe(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
-                    QuantType.per_Token,
-                    layer.w13_weight_scale1,
-                    layer.w2_weight_scale1,
+                    quant_type=QuantType.per_Token,
+                    w1_scale=layer.w13_weight_scale1,
+                    w2_scale=layer.w2_weight_scale1,
                     activation=(
                         ActivationType.Silu
                         if activation == "silu"
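These hunks move the ROCm path from `asm_moe` to `aiter.fused_moe` and pass the quantization type and weight scales by keyword rather than by position. A small illustrative sketch of why naming such arguments makes these call sites safer; `fused_moe_like` below is a hypothetical wrapper, not the aiter API:

from typing import Optional

import torch

def fused_moe_like(
    x: torch.Tensor,
    w13: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    *,
    quant_type: str = "per_token",   # keyword-only: callers must name it
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    activation: str = "silu",
) -> torch.Tensor:
    # Hypothetical stand-in: only checks that the named arguments arrived.
    assert quant_type in ("per_token", "per_128x128")
    assert w1_scale is not None and w2_scale is not None
    return x  # placeholder output

# Scales passed by name cannot silently bind to the wrong positional slot
# if the upstream signature later gains new parameters.
out = fused_moe_like(
    torch.randn(2, 16),
    torch.randn(4, 32, 16),
    torch.randn(4, 16, 16),
    torch.rand(2, 2),
    torch.zeros(2, 2, dtype=torch.int64),
    quant_type="per_token",
    w1_scale=torch.ones(4),
    w2_scale=torch.ones(4),
)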
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED
@@ -29,11 +29,17 @@ from sglang.srt.layers.quantization.utils import (
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, next_power_of_2
 
 if is_cuda():
     from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant
 
+try:
+    from flashinfer import fp4_quantize as fp4_quantize
+    from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+except ImportError:
+    flashinfer_cutlass_fused_moe = None
+
 # Initialize logger for the module
 logger = logging.getLogger(__name__)
 
@@ -429,6 +435,9 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         layer.alpha = Parameter(
             layer.input_scale * layer.weight_scale_2, requires_grad=False
         )
+        layer.input_scale_inv = Parameter(
+            (1 / input_scale_2).to(torch.float32), requires_grad=False
+        )
 
         # Pad and blockwise interleave weight_scale
         scales = layer.weight_scale
@@ -467,7 +476,7 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         output_shape = [x_m, w_n]
 
         # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_scale_interleaved = scaled_fp4_quant(x,
+        x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)
 
         assert x_fp4.dtype == torch.uint8
         assert x_scale_interleaved.dtype == torch.float8_e4m3fn
@@ -521,6 +530,7 @@ class ModelOptNvFp4FusedMoEMethod:
                 " quantization. Please use Blackwell and"
                 " above."
             )
+        self.enable_flashinfer_moe = False
 
     def create_weights(
         self,
@@ -674,7 +684,10 @@ class ModelOptNvFp4FusedMoEMethod:
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
         layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
 
-
+        if self.enable_flashinfer_moe:
+            w13_input_scale = layer.w13_input_scale.max().to(torch.float32)
+        else:
+            w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
         layer.g1_alphas = Parameter(
             (w13_input_scale * w13_weight_scale_2).to(torch.float32),
             requires_grad=False,
@@ -700,14 +713,19 @@ class ModelOptNvFp4FusedMoEMethod:
         layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
 
         # GEMM 2
+        if self.enable_flashinfer_moe:
+            w2_input_scale = layer.w2_input_scale.max().to(torch.float32)
+        else:
+            w2_input_scale = layer.w2_input_scale
+
         layer.g2_alphas = Parameter(
-            (
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
             requires_grad=False,
         )
 
         # This is for quantization, so we need to invert it.
         layer.w2_input_scale_quant = Parameter(
-            (1 /
+            (1 / w2_input_scale).to(torch.float32), requires_grad=False
         )
 
         assert (
@@ -727,11 +745,16 @@ class ModelOptNvFp4FusedMoEMethod:
         layer.cutlass_moe_params = CutlassMoEParams(
             CutlassMoEType.BlockscaledFP4,
             device,
-            num_experts=layer.num_experts,
+            num_experts=layer.num_experts,  # global num experts
             intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
             hidden_size=layer.w13_weight.shape[2] * 2,
         )  # k
 
+    @property
+    def load_up_proj_weight_first(self) -> bool:
+        # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13
+        return self.enable_flashinfer_moe
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -750,11 +773,13 @@ class ModelOptNvFp4FusedMoEMethod:
         inplace: bool = True,
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
+        ep_rank: Optional[int] = None,
+        ep_size: Optional[int] = None,
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
     ) -> torch.Tensor:
 
         assert activation == "silu", "Only SiLU activation is supported."
-
-        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
 
         topk_weights, topk_ids = select_experts(
@@ -771,6 +796,35 @@ class ModelOptNvFp4FusedMoEMethod:
             routed_scaling_factor=routed_scaling_factor,
         )
 
+        if self.enable_flashinfer_moe:
+            assert (
+                not apply_router_weight_on_input
+            ), "apply_router_weight_on_input is not supported for Flashinfer"
+            # TRTLLM Cutlass moe takes in activations in BF16/Half/nvfp4 precision
+            # and fp4 quantized weights loaded from the checkpoint
+            output = flashinfer_cutlass_fused_moe(
+                x,
+                topk_ids.to(torch.int),
+                topk_weights,
+                layer.w13_weight.view(torch.long),
+                layer.w2_weight.view(torch.long),
+                x.dtype,
+                quant_scales=[
+                    layer.w13_input_scale_quant,
+                    layer.w13_blockscale_swizzled.view(torch.int32),
+                    layer.g1_alphas,
+                    layer.w2_input_scale_quant,
+                    layer.w2_blockscale_swizzled.view(torch.int32),
+                    layer.g2_alphas,
+                ],
+                ep_size=ep_size,
+                ep_rank=ep_rank,
+                tp_size=tp_size,
+                tp_rank=tp_rank,
+                tune_max_num_tokens=next_power_of_2(x.shape[0]),
+            )
+            return output[0]
+
         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
 
         return cutlass_moe_fp4(
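modelopt_quant.py now treats FlashInfer as an optional dependency: the import is wrapped in try/except, the module-level name is set to None on failure, and the fast path is keyed off an `enable_flashinfer_moe` flag. A minimal sketch of that optional-dependency pattern, with a hypothetical `fast_gemm` backend standing in for the real kernel:

import torch

try:
    # Optional accelerator; the module must stay importable without it.
    from fictional_backend import fast_gemm  # hypothetical dependency
except ImportError:
    fast_gemm = None


class QuantLinearMethod:
    def __init__(self, enable_fast_path: bool = False):
        # Only honor the flag when the optional import actually succeeded.
        self.enable_fast_path = enable_fast_path and fast_gemm is not None

    def apply(self, x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        if self.enable_fast_path:
            return fast_gemm(x, w)
        # Reference path used whenever the optional backend is unavailable.
        return x @ w.t()


out = QuantLinearMethod().apply(torch.randn(2, 8), torch.randn(4, 8))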
sglang/srt/layers/quantization/utils.py
CHANGED
@@ -6,11 +6,14 @@ from typing import List, Mapping, Tuple, Union
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_npu
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
    from vllm._custom_ops import scaled_fp8_quant
 
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -8,10 +8,13 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -84,7 +87,9 @@ class RotaryEmbedding(CustomOp):
         if not _is_cuda:
             cache = cache.to(dtype)
 
-        if
+        if (
+            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
+        ) and not (_is_cpu and _is_cpu_amx_available):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -147,6 +152,26 @@ class RotaryEmbedding(CustomOp):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
@@ -696,6 +721,21 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         key = key_rot
         return query.to(dtype), key.to(dtype)
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions, query, key, self.head_size, self.cos_sin_cache, False
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
 
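Both new `forward_cpu` methods follow the same shape: call the fused `torch.ops.sgl_kernel.rotary_embedding_cpu` op when AMX support was detected, otherwise reuse `forward_native`. A small sketch of that per-backend-method-with-fallback idea, using a made-up capability flag and op name rather than any sglang or sgl_kernel API:

import torch
import torch.nn as nn

_HAS_FUSED_CPU_KERNEL = False  # assumption: would come from a capability probe


class PositionalOp(nn.Module):
    """Illustrative op with a native reference path and an optional fused CPU path."""

    def forward_native(self, q: torch.Tensor, k: torch.Tensor):
        # Reference implementation in plain PyTorch; always available.
        return q + 1.0, k + 1.0

    def forward_cpu(self, q: torch.Tensor, k: torch.Tensor):
        if _HAS_FUSED_CPU_KERNEL:
            # A real backend would call a registered custom op here, e.g.
            # torch.ops.my_kernels.fused_positional_cpu(q, k)  # hypothetical
            raise NotImplementedError("fused path not wired up in this sketch")
        # No fused kernel detected: fall back to the native path.
        return self.forward_native(q, k)


q_out, k_out = PositionalOp().forward_cpu(torch.zeros(2, 4), torch.zeros(2, 4))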