sglang 0.4.7.post1__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +14 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +301 -64
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +94 -46
- sglang/srt/disaggregation/prefill.py +20 -15
- sglang/srt/disaggregation/utils.py +47 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +27 -31
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +115 -34
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +897 -0
- sglang/srt/entrypoints/openai/serving_completions.py +425 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +170 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +28 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +43 -23
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +44 -2
- sglang/srt/layers/linear.py +18 -1
- sglang/srt/layers/logits_processor.py +14 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +286 -13
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +148 -26
- sglang/srt/layers/moe/topk.py +117 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_utils.py +5 -4
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +144 -12
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/layers/vocab_parallel_embedding.py +14 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/expert_distribution.py +21 -0
- sglang/srt/managers/io_struct.py +19 -14
- sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
- sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
- sglang/srt/managers/schedule_batch.py +49 -32
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +189 -68
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +77 -46
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +27 -8
- sglang/srt/model_loader/loader.py +50 -8
- sglang/srt/model_loader/weight_utils.py +100 -2
- sglang/srt/models/deepseek_nextn.py +35 -30
- sglang/srt/models/deepseek_v2.py +255 -30
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1009 -0
- sglang/srt/models/gemma3n_mm.py +511 -0
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +51 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -1
- sglang/srt/utils.py +248 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +121 -105
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
@@ -64,9 +64,12 @@ from sglang.srt.layers.quantization.utils import (
 )
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.utils import (
+    cpu_has_amx_support,
     get_bool_env_var,
+    is_cpu,
     is_cuda,
     is_hip,
+    is_npu,
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
@@ -74,6 +77,9 @@ from sglang.srt.utils import (

 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()

 _is_fp8_fnuz = is_fp8_fnuz()

@@ -82,10 +88,11 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _is_hip:
     from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight

-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
     from vllm._custom_ops import scaled_fp8_quant

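For orientation, the guard above keeps the vLLM fallback import only for platforms without a native kernel path (CUDA, NPU, and AMX-capable CPUs now skip it). A minimal sketch of the same dispatch pattern, with the platform probes stubbed as plain booleans (the real values come from sglang.srt.utils at import time):

# Sketch only: the probe values below are placeholders for is_cuda(),
# is_npu(), is_cpu(), and cpu_has_amx_support() from sglang.srt.utils.
_is_cuda = False
_is_npu = False
_is_cpu = True
_is_cpu_amx_available = True

if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
    # Only platforms without a native sgl-kernel path pull in the vLLM op.
    from vllm._custom_ops import scaled_fp8_quant  # noqa: F401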
@@ -1045,15 +1052,15 @@ class Fp8MoEMethod:
         if _use_hip_int4:
             # TODO: add triton kernel and add check _use_aiter
             assert not no_combine, f"{no_combine=} is not supported."
-            return
+            return fused_moe(
                 x,
                 layer.w13_weight,
                 layer.w2_weight,
                 topk_weights,
                 topk_ids,
-                QuantType.per_Token,
-                layer.w13_weight_scale1,
-                layer.w2_weight_scale1,
+                quant_type=QuantType.per_Token,
+                w1_scale=layer.w13_weight_scale1,
+                w2_scale=layer.w2_weight_scale1,
                 activation=(
                     ActivationType.Silu if activation == "silu" else ActivationType.Gelu
                 ),
@@ -1062,31 +1069,32 @@
         if _use_aiter:
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
-
-                assert (
-                    activation == "silu"
-                ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
-                return asm_moe(
+                return fused_moe(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
-                    layer.w13_weight_scale_inv,
-                    layer.w2_weight_scale_inv,
-
+                    w1_scale=layer.w13_weight_scale_inv,
+                    w2_scale=layer.w2_weight_scale_inv,
+                    quant_type=QuantType.per_128x128,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
                     expert_mask=None,
                 )
             else:
-                return
+                return fused_moe(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
-                    QuantType.per_Token,
-                    layer.w13_weight_scale1,
-                    layer.w2_weight_scale1,
+                    quant_type=QuantType.per_Token,
+                    w1_scale=layer.w13_weight_scale1,
+                    w2_scale=layer.w2_weight_scale1,
                     activation=(
                         ActivationType.Silu
                         if activation == "silu"
sglang/srt/layers/quantization/fp8_utils.py CHANGED
@@ -42,7 +42,10 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _use_aiter:
-
+    import aiter
+    from aiter import gemm_a8w8_blockscale_CK, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)

 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -271,9 +274,7 @@ def aiter_w8a8_block_fp8_linear(
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]

-    q_input, x_scale =
-        input_2d, block_size[1], column_major_scales=False
-    )
+    q_input, x_scale = aiter_per1x128_quant(input_2d, quant_dtype=aiter.dtypes.fp8)
     output = gemm_a8w8_blockscale_CK(
         q_input, weight, x_scale, weight_scale, dtype=input.dtype
     )
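The rewritten aiter_w8a8_block_fp8_linear path quantizes activations per (1 x 128) group via aiter's get_hip_quant kernel before the block-scaled GEMM. That kernel is ROCm-only, so the snippet below is only a rough, plain-PyTorch reference for the data layout such a quantization produces (one scale per 128-column group per row); the scale convention is an assumption, not aiter's exact behavior:

import torch

def per_1x128_quant_reference(x: torch.Tensor, group: int = 128):
    # Toy reference: FP8 values plus one float32 scale per (row, 128-column group).
    m, k = x.shape
    assert k % group == 0
    xg = x.float().view(m, k // group, group)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = xg.abs().amax(dim=-1).clamp(min=1e-12) / fp8_max  # (m, k // group)
    q = (xg / scale.unsqueeze(-1)).to(torch.float8_e4m3fn)
    return q.view(m, k), scale

x = torch.randn(4, 256)
q, s = per_1x128_quant_reference(x)
print(q.shape, s.shape)  # torch.Size([4, 256]) torch.Size([4, 2])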
sglang/srt/layers/quantization/modelopt_quant.py CHANGED
@@ -29,11 +29,17 @@ from sglang.srt.layers.quantization.utils import (
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, next_power_of_2

 if is_cuda():
     from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant

+try:
+    from flashinfer import fp4_quantize as fp4_quantize
+    from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+except ImportError:
+    flashinfer_cutlass_fused_moe = None
+
 # Initialize logger for the module
 logger = logging.getLogger(__name__)

@@ -429,6 +435,9 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         layer.alpha = Parameter(
             layer.input_scale * layer.weight_scale_2, requires_grad=False
         )
+        layer.input_scale_inv = Parameter(
+            (1 / input_scale_2).to(torch.float32), requires_grad=False
+        )

         # Pad and blockwise interleave weight_scale
         scales = layer.weight_scale
@@ -467,7 +476,7 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         output_shape = [x_m, w_n]

         # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_scale_interleaved = scaled_fp4_quant(x,
+        x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)

         assert x_fp4.dtype == torch.uint8
         assert x_scale_interleaved.dtype == torch.float8_e4m3fn
@@ -521,6 +530,7 @@ class ModelOptNvFp4FusedMoEMethod:
                 " quantization. Please use Blackwell and"
                 " above."
             )
+        self.enable_flashinfer_moe = False

     def create_weights(
         self,
@@ -674,7 +684,10 @@ class ModelOptNvFp4FusedMoEMethod:
         w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
         layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)

-
+        if self.enable_flashinfer_moe:
+            w13_input_scale = layer.w13_input_scale.max().to(torch.float32)
+        else:
+            w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
         layer.g1_alphas = Parameter(
             (w13_input_scale * w13_weight_scale_2).to(torch.float32),
             requires_grad=False,
@@ -700,14 +713,19 @@ class ModelOptNvFp4FusedMoEMethod:
         layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)

         # GEMM 2
+        if self.enable_flashinfer_moe:
+            w2_input_scale = layer.w2_input_scale.max().to(torch.float32)
+        else:
+            w2_input_scale = layer.w2_input_scale
+
         layer.g2_alphas = Parameter(
-            (
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
             requires_grad=False,
         )

         # This is for quantization, so we need to invert it.
         layer.w2_input_scale_quant = Parameter(
-            (1 /
+            (1 / w2_input_scale).to(torch.float32), requires_grad=False
         )

         assert (
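The two hunks above fold the input and weight global scales into per-expert dequant alphas and keep a reciprocal input scale for quantizing activations; the FlashInfer branch first collapses the per-expert input scales into one global maximum. A small standalone sketch of that bookkeeping with made-up scale tensors (names mirror the diff):

import torch

num_experts = 8
w2_input_scale = torch.rand(num_experts) + 0.5      # per-expert input scales (made up)
w2_weight_scale_2 = torch.rand(num_experts) + 0.5   # per-expert global weight scales

enable_flashinfer_moe = True
if enable_flashinfer_moe:
    # FlashInfer CUTLASS MoE expects a single global input scale.
    w2_input_scale = w2_input_scale.max().to(torch.float32)

g2_alphas = (w2_input_scale * w2_weight_scale_2).to(torch.float32)  # dequant alpha per expert
w2_input_scale_quant = (1 / w2_input_scale).to(torch.float32)       # applied when quantizing activations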
@@ -727,11 +745,16 @@ class ModelOptNvFp4FusedMoEMethod:
         layer.cutlass_moe_params = CutlassMoEParams(
             CutlassMoEType.BlockscaledFP4,
             device,
-            num_experts=layer.num_experts,
+            num_experts=layer.num_experts,  # global num experts
             intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
             hidden_size=layer.w13_weight.shape[2] * 2,
         )  # k

+    @property
+    def load_up_proj_weight_first(self) -> bool:
+        # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13
+        return self.enable_flashinfer_moe
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -750,11 +773,13 @@ class ModelOptNvFp4FusedMoEMethod:
         inplace: bool = True,
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
+        ep_rank: Optional[int] = None,
+        ep_size: Optional[int] = None,
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
     ) -> torch.Tensor:

         assert activation == "silu", "Only SiLU activation is supported."
-
-        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
         from sglang.srt.layers.moe.topk import select_experts

         topk_weights, topk_ids = select_experts(
|
|
771
796
|
routed_scaling_factor=routed_scaling_factor,
|
772
797
|
)
|
773
798
|
|
799
|
+
if self.enable_flashinfer_moe:
|
800
|
+
assert (
|
801
|
+
not apply_router_weight_on_input
|
802
|
+
), "apply_router_weight_on_input is not supported for Flashinfer"
|
803
|
+
# TRTLLM Cutlass moe takes in activations in BF16/Half/nvfp4 precision
|
804
|
+
# and fp4 quantized weights loaded from the checkpoint
|
805
|
+
output = flashinfer_cutlass_fused_moe(
|
806
|
+
x,
|
807
|
+
topk_ids.to(torch.int),
|
808
|
+
topk_weights,
|
809
|
+
layer.w13_weight.view(torch.long),
|
810
|
+
layer.w2_weight.view(torch.long),
|
811
|
+
x.dtype,
|
812
|
+
quant_scales=[
|
813
|
+
layer.w13_input_scale_quant,
|
814
|
+
layer.w13_blockscale_swizzled.view(torch.int32),
|
815
|
+
layer.g1_alphas,
|
816
|
+
layer.w2_input_scale_quant,
|
817
|
+
layer.w2_blockscale_swizzled.view(torch.int32),
|
818
|
+
layer.g2_alphas,
|
819
|
+
],
|
820
|
+
ep_size=ep_size,
|
821
|
+
ep_rank=ep_rank,
|
822
|
+
tp_size=tp_size,
|
823
|
+
tp_rank=tp_rank,
|
824
|
+
tune_max_num_tokens=next_power_of_2(x.shape[0]),
|
825
|
+
)
|
826
|
+
return output[0]
|
827
|
+
|
774
828
|
from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
|
775
829
|
|
776
830
|
return cutlass_moe_fp4(
|
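The FlashInfer call above rounds the token count up with next_power_of_2(x.shape[0]) for tune_max_num_tokens. For reference, a next-power-of-two helper is assumed to behave like this (a sketch, not necessarily sglang.srt.utils.next_power_of_2 verbatim):

def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n, for n >= 1.
    return 1 << (n - 1).bit_length()

assert [next_power_of_2(n) for n in (1, 2, 3, 5, 8, 100)] == [1, 2, 4, 8, 8, 128]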
sglang/srt/layers/quantization/utils.py CHANGED
@@ -6,11 +6,14 @@ from typing import List, Mapping, Tuple, Union
 import torch

 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_npu

 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()

-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
     from vllm._custom_ops import scaled_fp8_quant

sglang/srt/layers/rotary_embedding.py CHANGED
@@ -8,13 +8,29 @@ import torch
 import torch.nn as nn

 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)

 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()

 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+if _use_aiter:
+    from aiter.rotary_embedding import get_rope as aiter_get_rope
+
+if is_npu():
+    import torch_npu


 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -84,7 +100,9 @@ class RotaryEmbedding(CustomOp):
         if not _is_cuda:
             cache = cache.to(dtype)

-        if
+        if (
+            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
+        ) and not (_is_cpu and _is_cpu_amx_available):
             from vllm._custom_ops import rotary_embedding

             self.vllm_rotary_embedding = rotary_embedding
@@ -147,6 +165,56 @@ class RotaryEmbedding(CustomOp):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key

+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-npu implementation of forward()."""
+        import os
+
+        if get_bool_env_var("SGLANG_ENABLE_TORCH_COMPILE"):
+            return self.forward_native(positions, query, key, offsets)
+        else:
+            rotary_mode = "half"
+            if self.is_neox_style:
+                rotary_mode = "half"
+            else:
+                rotary_mode = "interleave"
+            mrope_section = [0, 0, 0]
+            query_out, key_out = torch_npu.npu_mrope(
+                positions,
+                query,
+                key,
+                self.cos_sin_cache,
+                self.head_size,
+                mrope_section=mrope_section,
+                rotary_mode=rotary_mode,
+            )
+            return query_out, key_out
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
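The new forward_npu and forward_cpu overrides dispatch to platform kernels (torch_npu.npu_mrope, torch.ops.sgl_kernel.rotary_embedding_cpu) when available and otherwise fall back to forward_native. As a reminder of what these kernels compute, here is a minimal neox-style rotary application in plain PyTorch; this is a reference sketch, not the sgl-kernel implementation:

import torch

def apply_neox_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (..., rotary_dim); cos/sin: (..., rotary_dim // 2), gathered at the token positions.
    x1, x2 = x.chunk(2, dim=-1)  # neox style splits halves rather than interleaving
    return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)

rotary_dim = 64
positions = torch.arange(8)
inv_freq = 1.0 / (10000 ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
freqs = torch.einsum("i,j->ij", positions.float(), inv_freq)
q = torch.randn(8, rotary_dim)
q_rot = apply_neox_rope(q, freqs.cos(), freqs.sin())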
@@ -696,6 +764,21 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         key = key_rot
         return query.to(dtype), key.to(dtype)

+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions, query, key, self.head_size, self.cos_sin_cache, False
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+

 class Llama3RotaryEmbedding(RotaryEmbedding):

@@ -807,6 +890,43 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         return query_out.type_as(query), key_out.type_as(key)


+class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_alpha: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_alpha = scaling_alpha
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        max_len = self.max_position_embeddings
+        base = self.base * self.scaling_alpha ** (
+            self.rotary_dim / (self.rotary_dim - 2)
+        )
+
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
 class MRotaryEmbedding(RotaryEmbedding):
     """Rotary Embedding with Multimodal Sections."""

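The new DynamicNTKAlphaRotaryEmbedding differs from the factor-based variant only in how it rescales the RoPE base: base' = base * alpha ** (rotary_dim / (rotary_dim - 2)), while the cached position range stays at max_position_embeddings. A quick standalone check of that formula with illustrative numbers (the inverse-frequency expression mirrors the usual RotaryEmbedding._compute_inv_freq and is assumed here, not copied from the source):

import torch

base, rotary_dim, scaling_alpha = 10000.0, 128, 4.0

scaled_base = base * scaling_alpha ** (rotary_dim / (rotary_dim - 2))
inv_freq = 1.0 / (scaled_base ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))

print(round(scaled_base))   # ~40889: a larger effective base
print(inv_freq[-1].item())  # slower-rotating high dimensions than with base=10000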
@@ -1151,15 +1271,26 @@ def get_rope(
         )
     elif scaling_type == "dynamic":
         scaling_factor = rope_scaling["factor"]
-        rotary_emb = DynamicNTKScalingRotaryEmbedding(
-            head_size,
-            rotary_dim,
-            max_position,
-            base,
-            is_neox_style,
-            scaling_factor,
-            dtype,
-        )
+        if "alpha" in rope_scaling:
+            rotary_emb = DynamicNTKAlphaRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                rope_scaling["alpha"],
+                dtype,
+            )
+        else:
+            rotary_emb = DynamicNTKScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+            )
     elif scaling_type == "yarn":
         scaling_factor = rope_scaling["factor"]
         original_max_position = rope_scaling["original_max_position_embeddings"]
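In get_rope, the "dynamic" branch now chooses the implementation from the rope_scaling dict: an "alpha" key selects the NTK-alpha variant, otherwise the existing "factor"-based class is used. A tiny sketch of that selection with the embedding classes stubbed as names:

def pick_dynamic_rope_impl(rope_scaling: dict) -> str:
    # Mirrors the new "dynamic" branch above; classes stubbed as strings.
    if "alpha" in rope_scaling:
        return "DynamicNTKAlphaRotaryEmbedding"   # driven by rope_scaling["alpha"]
    return "DynamicNTKScalingRotaryEmbedding"     # driven by rope_scaling["factor"]

assert pick_dynamic_rope_impl({"type": "dynamic", "alpha": 1000.0}) == "DynamicNTKAlphaRotaryEmbedding"
assert pick_dynamic_rope_impl({"type": "dynamic", "factor": 4.0}) == "DynamicNTKScalingRotaryEmbedding"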
@@ -1348,7 +1479,8 @@ def get_rope_wrapper(
     device: Optional[str] = None,
 ):
     if device != "cpu":
-        return get_rope(
+        wrapper = aiter_get_rope if _use_aiter else get_rope
+        return wrapper(
             head_size,
             rotary_dim,
             max_position,
sglang/srt/layers/sampler.py CHANGED
sglang/srt/layers/vocab_parallel_embedding.py CHANGED
@@ -20,10 +20,18 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizeMethodBase,
     method_has_implemented_embedding,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    PackWeightMethod,
+    cpu_has_amx_support,
+    is_cpu,
+    set_weight_attrs,
+)

 DEFAULT_VOCAB_PADDING_SIZE = 64

+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+

 class UnquantizedEmbeddingMethod(QuantizeMethodBase):
     """Unquantized method for embeddings."""
@@ -549,6 +557,11 @@ class ParallelLMHead(VocabParallelEmbedding):
             use_presharded_weights=use_presharded_weights,
         )
         self.quant_config = quant_config
+
+        # We only support pack LMHead if it's not quantized. For LMHead with quant_config, the weight_name will be "qweight"
+        if self.quant_config is None and _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(weight_names=["weight"])
+
         if bias:
             self.bias = Parameter(
                 torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)