sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +168 -22
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +49 -0
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +35 -0
  8. sglang/srt/custom_op.py +7 -1
  9. sglang/srt/disaggregation/base/conn.py +2 -0
  10. sglang/srt/disaggregation/decode.py +22 -6
  11. sglang/srt/disaggregation/mooncake/conn.py +289 -48
  12. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
  13. sglang/srt/disaggregation/nixl/conn.py +100 -52
  14. sglang/srt/disaggregation/prefill.py +5 -4
  15. sglang/srt/disaggregation/utils.py +13 -12
  16. sglang/srt/distributed/parallel_state.py +44 -17
  17. sglang/srt/entrypoints/EngineBase.py +8 -0
  18. sglang/srt/entrypoints/engine.py +45 -9
  19. sglang/srt/entrypoints/http_server.py +111 -24
  20. sglang/srt/entrypoints/openai/protocol.py +51 -6
  21. sglang/srt/entrypoints/openai/serving_chat.py +52 -76
  22. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  23. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  24. sglang/srt/eplb/__init__.py +0 -0
  25. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  26. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  27. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  28. sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
  29. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  30. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  31. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  32. sglang/srt/hf_transformers_utils.py +2 -1
  33. sglang/srt/layers/activation.py +7 -0
  34. sglang/srt/layers/amx_utils.py +86 -0
  35. sglang/srt/layers/attention/ascend_backend.py +219 -0
  36. sglang/srt/layers/attention/flashattention_backend.py +56 -23
  37. sglang/srt/layers/attention/tbo_backend.py +37 -9
  38. sglang/srt/layers/communicator.py +18 -2
  39. sglang/srt/layers/dp_attention.py +9 -3
  40. sglang/srt/layers/elementwise.py +76 -12
  41. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  42. sglang/srt/layers/layernorm.py +41 -0
  43. sglang/srt/layers/linear.py +99 -12
  44. sglang/srt/layers/logits_processor.py +15 -6
  45. sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
  46. sglang/srt/layers/moe/ep_moe/layer.py +115 -25
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -0
  49. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
  50. sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
  51. sglang/srt/layers/moe/router.py +60 -22
  52. sglang/srt/layers/moe/topk.py +36 -28
  53. sglang/srt/layers/parameter.py +67 -7
  54. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  55. sglang/srt/layers/quantization/fp8.py +44 -0
  56. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  57. sglang/srt/layers/quantization/fp8_utils.py +6 -6
  58. sglang/srt/layers/quantization/gptq.py +5 -1
  59. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  60. sglang/srt/layers/quantization/quant_utils.py +166 -0
  61. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  62. sglang/srt/layers/rotary_embedding.py +105 -13
  63. sglang/srt/layers/vocab_parallel_embedding.py +19 -2
  64. sglang/srt/lora/lora.py +4 -5
  65. sglang/srt/lora/lora_manager.py +73 -20
  66. sglang/srt/managers/configure_logging.py +1 -1
  67. sglang/srt/managers/io_struct.py +60 -15
  68. sglang/srt/managers/mm_utils.py +73 -59
  69. sglang/srt/managers/multimodal_processor.py +2 -6
  70. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  71. sglang/srt/managers/schedule_batch.py +80 -79
  72. sglang/srt/managers/scheduler.py +153 -63
  73. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  74. sglang/srt/managers/session_controller.py +12 -3
  75. sglang/srt/managers/tokenizer_manager.py +314 -103
  76. sglang/srt/managers/tp_worker.py +13 -1
  77. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  78. sglang/srt/mem_cache/allocator.py +290 -0
  79. sglang/srt/mem_cache/chunk_cache.py +34 -2
  80. sglang/srt/mem_cache/memory_pool.py +289 -3
  81. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  82. sglang/srt/model_executor/cuda_graph_runner.py +3 -2
  83. sglang/srt/model_executor/forward_batch_info.py +17 -4
  84. sglang/srt/model_executor/model_runner.py +302 -58
  85. sglang/srt/model_loader/loader.py +86 -10
  86. sglang/srt/model_loader/weight_utils.py +160 -3
  87. sglang/srt/models/deepseek_nextn.py +5 -4
  88. sglang/srt/models/deepseek_v2.py +305 -26
  89. sglang/srt/models/deepseek_vl2.py +3 -5
  90. sglang/srt/models/gemma3_causal.py +1 -2
  91. sglang/srt/models/gemma3n_audio.py +949 -0
  92. sglang/srt/models/gemma3n_causal.py +1010 -0
  93. sglang/srt/models/gemma3n_mm.py +495 -0
  94. sglang/srt/models/hunyuan.py +771 -0
  95. sglang/srt/models/kimi_vl.py +1 -2
  96. sglang/srt/models/llama.py +10 -4
  97. sglang/srt/models/llama4.py +32 -45
  98. sglang/srt/models/llama_eagle3.py +61 -11
  99. sglang/srt/models/llava.py +5 -5
  100. sglang/srt/models/minicpmo.py +2 -2
  101. sglang/srt/models/mistral.py +1 -1
  102. sglang/srt/models/mllama4.py +43 -11
  103. sglang/srt/models/phi4mm.py +1 -3
  104. sglang/srt/models/pixtral.py +3 -7
  105. sglang/srt/models/qwen2.py +31 -3
  106. sglang/srt/models/qwen2_5_vl.py +1 -3
  107. sglang/srt/models/qwen2_audio.py +200 -0
  108. sglang/srt/models/qwen2_moe.py +32 -6
  109. sglang/srt/models/qwen2_vl.py +1 -4
  110. sglang/srt/models/qwen3.py +94 -25
  111. sglang/srt/models/qwen3_moe.py +68 -21
  112. sglang/srt/models/vila.py +3 -8
  113. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
  114. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  115. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  116. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  117. sglang/srt/multimodal/processors/gemma3n.py +82 -0
  118. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  119. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  129. sglang/srt/operations_strategy.py +6 -2
  130. sglang/srt/reasoning_parser.py +26 -0
  131. sglang/srt/sampling/sampling_batch_info.py +39 -1
  132. sglang/srt/server_args.py +85 -24
  133. sglang/srt/speculative/build_eagle_tree.py +57 -18
  134. sglang/srt/speculative/eagle_worker.py +6 -4
  135. sglang/srt/two_batch_overlap.py +204 -28
  136. sglang/srt/utils.py +369 -138
  137. sglang/srt/warmup.py +12 -3
  138. sglang/test/runners.py +10 -1
  139. sglang/test/test_utils.py +15 -3
  140. sglang/version.py +1 -1
  141. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
  142. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
  143. sglang/math_utils.py +0 -8
  144. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  145. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  146. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  147. /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
  148. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
  149. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/layers/elementwise.py

@@ -8,6 +8,7 @@ from sglang.srt.utils import is_hip
 
 _is_hip = is_hip()
 
+
 fused_softcap_autotune = triton.autotune(
     configs=[
         triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=4),
@@ -189,21 +190,16 @@ def fused_dual_residual_rmsnorm(x, residual, weight1, weight2, eps, autotune=False):
     assert x.shape == residual.shape and x.dtype == residual.dtype
     output, mid = torch.empty_like(x), torch.empty_like(x)
     bs, hidden_dim = x.shape
-
-    min_num_warps = 16 if _is_hip else 32
-
     if autotune:
         fused_dual_residual_rmsnorm_kernel_autotune[(bs,)](
             output, mid, x, residual, weight1, weight2, eps=eps, hidden_dim=hidden_dim
         )
     else:
+        max_warps = 16 if _is_hip else 32
         config = {
             "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
             "num_warps": max(
-                min(
-                    triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), min_num_warps
-                ),
-                4,
+                min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
             ),
         }
 
@@ -260,13 +256,11 @@ def fused_rmsnorm(x, weight, eps, autotune=False, inplace=False):
     else:
         output = torch.empty_like(x)
     bs, hidden_dim = x.shape
-
-    min_num_warps = 16 if _is_hip else 32
-
+    max_warps = 16 if _is_hip else 32
     config = {
         "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
         "num_warps": max(
-            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), min_num_warps), 4
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
         ),
     }
 
@@ -331,6 +325,75 @@ class FusedDualResidualRMSNorm:
         return self.rmsnorm2.forward_native(residual), residual
 
 
+@triton.jit
+def experts_combine_kernel(
+    out_hidden_states,
+    moe_hidden_states,
+    mlp_hidden_states,
+    combine_k: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_index_mlp = pid * hidden_dim
+    start_index_rmoe = pid * hidden_dim * combine_k
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+    combine_k_offsets = tl.arange(0, combine_k)
+
+    moe_x = tl.load(
+        moe_hidden_states
+        + start_index_rmoe
+        + combine_k_offsets[:, None] * hidden_dim
+        + offsets[None, :],
+        mask=mask[None, :],
+        other=0.0,
+    )
+    moe_x = tl.sum(moe_x, axis=0)
+    mlp_x = tl.load(mlp_hidden_states + start_index_mlp + offsets, mask=mask, other=0.0)
+    combined_x = (moe_x + mlp_x) / 1.4142135623730951
+
+    tl.store(out_hidden_states + start_index_mlp + offsets, combined_x, mask=mask)
+
+
+def experts_combine_triton(moe_hidden_states, mlp_hidden_states, output_buffer=None):
+    assert moe_hidden_states.is_contiguous()
+    assert mlp_hidden_states.is_contiguous()
+
+    if len(moe_hidden_states.shape) == 2:
+        combine_k = 1  # pre-combined
+    else:
+        combine_k = moe_hidden_states.shape[1]
+
+    if output_buffer is None:
+        out_hidden_states = torch.empty_like(mlp_hidden_states)
+    else:
+        flat_output_buffer = output_buffer.view(mlp_hidden_states.dtype).reshape(-1)
+        assert flat_output_buffer.numel() >= mlp_hidden_states.numel()
+        out_hidden_states = flat_output_buffer[: mlp_hidden_states.numel()].reshape(
+            mlp_hidden_states.shape
+        )
+
+    bs, hidden_dim = mlp_hidden_states.shape
+
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 1024)), 8), 4
+        ),
+    }
+
+    experts_combine_kernel[(bs,)](
+        out_hidden_states,
+        moe_hidden_states,
+        mlp_hidden_states,
+        combine_k,
+        hidden_dim,
+        **config,
+    )
+    return out_hidden_states
+
+
 # gelu on first half of vector
 @triton.jit
 def gelu_and_mul_kernel(
@@ -400,10 +463,11 @@ def gelu_and_mul_triton(
         out_scales = scales
         static_scale = True
 
+    max_warps = 16 if _is_hip else 32
     config = {
         # 8 ele per thread (not tuned)
        "num_warps": max(
-            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), 32), 4
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
         ),
     }
 
sglang/srt/layers/flashinfer_comm_fusion.py (new file)

@@ -0,0 +1,202 @@
+import logging
+from typing import Tuple
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.utils import is_flashinfer_available
+
+logger = logging.getLogger(__name__)
+
+_flashinfer_comm = None
+_workspace_manager = None
+
+if is_flashinfer_available():
+    try:
+        import flashinfer.comm as comm
+
+        _flashinfer_comm = comm
+    except ImportError:
+        logger.warning(
+            "flashinfer.comm is not available, falling back to standard "
+            "implementation"
+        )
+
+
+class FlashInferWorkspaceManager:
+    def __init__(self):
+        self.workspace_tensor = None
+        self.ipc_handles = None
+        self.world_size = None
+        self.rank = None
+        self.initialized = False
+
+    def initialize(
+        self,
+        world_size: int,
+        rank: int,
+        max_token_num: int,
+        hidden_dim: int,
+        group=None,
+        use_fp32_lamport: bool = False,
+    ):
+        """Initialize workspace"""
+        if self.initialized and self.world_size == world_size:
+            return
+
+        if _flashinfer_comm is None:
+            logger.warning(
+                "FlashInfer comm not available, skipping workspace " "initialization"
+            )
+            return
+
+        self.cleanup()
+
+        self.ipc_handles, self.workspace_tensor = (
+            comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+                rank,
+                world_size,
+                max_token_num,
+                hidden_dim,
+                group=group,
+                use_fp32_lamport=use_fp32_lamport,
+            )
+        )
+
+        self.world_size = world_size
+        self.rank = rank
+        self.initialized = True
+
+        logger.info(
+            f"FlashInfer workspace initialized for rank {rank}, "
+            f"world_size {world_size}"
+        )
+
+    def cleanup(self):
+        """Clean up workspace"""
+        if self.initialized and self.ipc_handles is not None:
+            try:
+                _flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(
+                    self.ipc_handles, group=dist.group.WORLD
+                )
+            except Exception as e:
+                logger.warning(f"Failed to cleanup FlashInfer workspace: {e}")
+            finally:
+                self.workspace_tensor = None
+                self.ipc_handles = None
+                self.initialized = False
+
+
+_workspace_manager = FlashInferWorkspaceManager()
+
+
+def ensure_workspace_initialized(
+    max_token_num: int = 1024, hidden_dim: int = 4096, use_fp32_lamport: bool = False
+):
+    """Ensure workspace is initialized"""
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        return False
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        return False
+
+    rank = dist.get_rank()
+
+    if (
+        not _workspace_manager.initialized
+        or _workspace_manager.world_size != world_size
+    ):
+        _workspace_manager.initialize(
+            world_size=world_size,
+            rank=rank,
+            max_token_num=max_token_num,
+            hidden_dim=hidden_dim,
+            use_fp32_lamport=use_fp32_lamport,
+        )
+
+    return _workspace_manager.initialized
+
+
+def flashinfer_allreduce_add_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 1024,
+    use_oneshot: bool = True,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Use FlashInfer's fused allreduce + residual + RMS norm operation
+
+    Args:
+        input_tensor: Input tensor that needs allreduce
+        residual: Residual tensor
+        weight: RMS norm weight
+        eps: RMS norm epsilon
+        max_token_num: Maximum token number
+        use_oneshot: Whether to use oneshot mode
+        trigger_completion_at_end: Whether to trigger completion at end
+        fp32_acc: Whether to use fp32 precision
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: (norm_output, residual_output)
+    """
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        logger.debug(
+            "FlashInfer not available, falling back to standard " "implementation"
+        )
+        return None, None
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        logger.debug("Single GPU, no need for allreduce fusion")
+        return None, None
+
+    if not ensure_workspace_initialized(
+        max_token_num=max_token_num,
+        hidden_dim=input_tensor.shape[-1],
+        use_fp32_lamport=(input_tensor.dtype == torch.float32),
+    ):
+        logger.debug("FlashInfer workspace not available")
+        return None, None
+
+    token_num, hidden_dim = input_tensor.shape
+
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+
+    _flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        world_size=world_size,
+        world_rank=dist.get_rank(),
+        token_num=token_num,
+        hidden_dim=hidden_dim,
+        workspace_ptrs=_workspace_manager.workspace_tensor,
+        launch_with_pdl=True,
+        use_oneshot=use_oneshot,
+        trigger_completion_at_end=trigger_completion_at_end,
+        fp32_acc=fp32_acc,
+        pattern_code=(_flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm),
+        allreduce_out=None,
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        quant_out=None,
+        scale_out=None,
+        rms_gamma=weight,
+        rms_eps=eps,
+        scale_factor=None,
+        layout_code=None,
+    )
+
+    return norm_out, residual_out
+
+
+def cleanup_flashinfer_workspace():
+    global _workspace_manager
+    if _workspace_manager is not None:
+        _workspace_manager.cleanup()
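A hedged sketch of the intended calling convention (the caller below is hypothetical; the real call site is RMSNorm.forward_with_allreduce_fusion in the layernorm.py hunks that follow): the helper returns (None, None) whenever the fused path cannot be used, and the caller falls back to its existing unfused code.

    # hypothetical caller, not part of the diff
    norm_out, residual_out = flashinfer_allreduce_add_rmsnorm(
        input_tensor=hidden_states,   # tensor that still needs the TP all-reduce
        residual=residual,
        weight=norm_weight,
        eps=1e-6,
    )
    if norm_out is None:
        # fall back to the unfused all-reduce + add + RMSNorm path
        ...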
sglang/srt/layers/layernorm.py

@@ -52,6 +52,9 @@ elif _is_hip:
 
 logger = logging.getLogger(__name__)
 
+if is_npu():
+    import torch_npu
+
 
 class RMSNorm(CustomOp):
     def __init__(
@@ -76,6 +79,18 @@ class RMSNorm(CustomOp):
         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
 
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            return out, residual_out
+        return torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+
     def forward_aiter(
         self,
         x: torch.Tensor,
@@ -148,6 +163,32 @@ class RMSNorm(CustomOp):
         else:
             return self.forward_native(x, residual)
 
+    def forward_with_allreduce_fusion(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Forward method with allreduce fusion, prioritizing flashinfer fused operations
+        """
+        if residual is not None:
+            from sglang.srt.distributed import get_tensor_model_parallel_world_size
+            from sglang.srt.layers.flashinfer_comm_fusion import (
+                flashinfer_allreduce_add_rmsnorm,
+            )
+
+            if get_tensor_model_parallel_world_size() > 1:
+                fused_result = flashinfer_allreduce_add_rmsnorm(
+                    input_tensor=x,
+                    residual=residual,
+                    weight=self.weight,
+                    eps=self.variance_epsilon,
+                )
+                if fused_result[0] is not None:
+                    return fused_result
+
+        return self.forward(x, residual)
+
 
 class GemmaRMSNorm(CustomOp):
     def __init__(
sglang/srt/layers/linear.py

@@ -17,6 +17,7 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     BlockQuantScaleParameter,
@@ -30,7 +31,12 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -52,6 +58,9 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "IPEXAWQLinearMethod",
 ]
 
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
 
 def adjust_marlin_shard(param, shard_size, shard_offset):
     marlin_tile_size = getattr(param, "marlin_tile_size", None)
@@ -165,6 +174,10 @@ class UnquantizedLinearMethod(LinearMethodBase):
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu and _is_cpu_amx_available:
+            _amx_process_weight_after_loading(layer, ["weight"])
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -172,6 +185,11 @@ class UnquantizedLinearMethod(LinearMethodBase):
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.weight_packed_linear(
+                x, layer.weight, bias, True  # is_vnni
+            )
+
         return F.linear(x, layer.weight, bias)
 
 
@@ -408,8 +426,26 @@ class ColumnParallelLinear(LinearBase):
         if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = self.tp_rank * shard_size
-            if not self.use_presharded_weights:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not self.use_presharded_weights,
+                )
+            else:
+                if not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
 
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
@@ -626,10 +662,29 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             start_idx = self.tp_rank * shard_size
-            # bitsandbytes loads the weights of the specific portion
-            # no need to narrow here
-            if not use_bitsandbytes_4bit and not self.use_presharded_weights:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
         # Special case for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -1094,10 +1149,27 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_id = self.tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
 
-            # bitsandbytes loads the weights of the specific portion
-            # no need to narrow here
-            if not use_bitsandbytes_4bit and not self.use_presharded_weights:
-                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
 
         # Special case for for AQLM codebooks.
         elif is_metadata:
@@ -1239,7 +1311,22 @@ class RowParallelLinear(LinearBase):
         ):
             shard_size = param_data.shape[input_dim]
             start_idx = self.tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    input_dim,
+                    shard_size,
+                )
+            else:
+                loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
        # have a shape (such as in the case of AutoFP8).
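For orientation, a sketch of what the pre-existing GPU branch selects (the sizes below are assumed, not from the diff): each tensor-parallel rank keeps one contiguous slice of the sharded dimension, and the new _is_cpu branch routes the same slicing through narrow_padded_param_and_loaded_weight, presumably so that padded CPU parameters and checkpoint weights stay aligned.

    # illustrative shard arithmetic for ColumnParallelLinear (assumed sizes)
    tp_rank, tp_size, out_features = 1, 4, 8192
    shard_size = out_features // tp_size   # 2048 output rows per rank
    start_idx = tp_rank * shard_size       # rank 1 loads rows 2048..4095
    # loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)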
sglang/srt/layers/logits_processor.py

@@ -42,7 +42,7 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardBatch,
     ForwardMode,
 )
-from sglang.srt.utils import dump_to_file
+from sglang.srt.utils import dump_to_file, use_intel_amx_backend
 
 logger = logging.getLogger(__name__)
 
@@ -436,17 +436,26 @@ class LogitsProcessor(nn.Module):
         if self.do_tensor_parallel_all_gather_dp_attn:
             logits_metadata.compute_dp_attention_metadata(hidden_states)
             hidden_states, local_hidden_states = (
-                logits_metadata.gathered_buffer,
-                hidden_states.clone(),
+                torch.empty_like(logits_metadata.gathered_buffer),
+                hidden_states,
             )
             dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
 
         if hasattr(lm_head, "weight"):
-            logits = torch.matmul(
-                hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
-            )
+            if use_intel_amx_backend(lm_head):
+                logits = torch.ops.sgl_kernel.weight_packed_linear(
+                    hidden_states.to(lm_head.weight.dtype),
+                    lm_head.weight,
+                    None,  # bias
+                    True,  # is_vnni
+                )
+            else:
+                logits = torch.matmul(
+                    hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
+                )
         else:
             # GGUF models
+            # TODO: use weight_packed_linear for GGUF models
             logits = lm_head.quant_method.apply(lm_head, hidden_states, embedding_bias)
 
         if self.logit_scale is not None:
sglang/srt/layers/moe/ep_moe/kernels.py

@@ -4,9 +4,8 @@ from typing import List, Optional
 import torch
 import triton
 
-from sglang.math_utils import ceil_div
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import dispose_tensor, is_cuda
+from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
 
 logger = logging.getLogger(__name__)
 
@@ -814,14 +813,17 @@ def _fwd_kernel_ep_scatter_2(
     offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
     mask = offset_in < HIDDEN_SIZE
 
-    offset_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
-    mask_s = offset_in_s < SCALE_HIDDEN_SIZE
+    index_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
+    mask_s = index_in_s < SCALE_HIDDEN_SIZE
 
     for token_id_int32 in range(start_token_id, total_token_num, grid_num):
         token_id = token_id_int32.to(tl.int64)
         to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
         to_copy_s = tl.load(
-            recv_x_scale + token_id * recv_x_scale_stride0 + offset_in_s, mask=mask_s
+            recv_x_scale
+            + token_id * recv_x_scale_stride0
+            + index_in_s * recv_x_scale_stride1,
+            mask=mask_s,
         )
 
         for topk_idx_int32 in tl.range(0, topk_num, 1, num_stages=4):
@@ -842,7 +844,11 @@ def _fwd_kernel_ep_scatter_2(
                 output_tensor_scale + dest_token_index * output_tensor_scale_stride0
             )
             tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
-            tl.store(output_tensor_scale_ptr + offset_in_s, to_copy_s, mask=mask_s)
+            tl.store(
+                output_tensor_scale_ptr + index_in_s * output_tensor_scale_stride1,
+                to_copy_s,
+                mask=mask_s,
+            )
 
 
 # copy from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/deepep_scatter_gather.py
@@ -857,6 +863,7 @@ def ep_scatter(
     output_tensor_scale: torch.Tensor,
     m_indices: torch.Tensor,
     output_index: torch.Tensor,
+    scale_ue8m0: bool = False,
 ):
     BLOCK_E = 128  # token num of per expert is aligned to 128
     BLOCK_D = 128  # block size of quantization
@@ -866,7 +873,15 @@ def ep_scatter(
     # grid = (triton.cdiv(hidden_size, BLOCK_D), num_experts)
     grid = num_experts
 
+    scale_hidden_size = hidden_size // BLOCK_D
+    if scale_ue8m0:
+        # ue8m0 scales are packed here (4 scales per int32),
+        # hence the effective size of this dimension is divided by 4.
+        scale_hidden_size = ceil_div(scale_hidden_size, 4)
+
     assert m_indices.shape[0] % BLOCK_E == 0
+    assert recv_x_scale.dtype == output_tensor_scale.dtype
+    assert recv_x_scale.shape[1] == output_tensor_scale.shape[1] == scale_hidden_size
 
     _fwd_kernel_ep_scatter_1[(grid,)](
         num_recv_tokens_per_expert,
@@ -905,8 +920,8 @@ def ep_scatter(
         num_warps=num_warps,
         HIDDEN_SIZE=hidden_size,
         HIDDEN_SIZE_PAD=triton.next_power_of_2(hidden_size),
-        SCALE_HIDDEN_SIZE=hidden_size // BLOCK_D,
-        SCALE_HIDDEN_SIZE_PAD=triton.next_power_of_2(hidden_size // BLOCK_D),
+        SCALE_HIDDEN_SIZE=scale_hidden_size,
+        SCALE_HIDDEN_SIZE_PAD=triton.next_power_of_2(scale_hidden_size),
     )
     return
 
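The packed-scale sizing introduced in ep_scatter, worked through with an assumed hidden size (7168 is illustrative, not from the diff):

    def ceil_div(a: int, b: int) -> int:   # same semantics as the ceil_div imported above
        return -(-a // b)

    hidden_size, BLOCK_D = 7168, 128
    scale_hidden_size = hidden_size // BLOCK_D   # 56 per-group scales per token
    packed = ceil_div(scale_hidden_size, 4)      # 14 int32 words when scale_ue8m0=True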