sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +2 -0
  3. sglang/srt/configs/longcat_flash.py +104 -0
  4. sglang/srt/configs/model_config.py +14 -1
  5. sglang/srt/connector/__init__.py +1 -1
  6. sglang/srt/connector/base_connector.py +1 -2
  7. sglang/srt/connector/redis.py +2 -2
  8. sglang/srt/connector/serde/__init__.py +1 -1
  9. sglang/srt/connector/serde/safe_serde.py +4 -3
  10. sglang/srt/disaggregation/ascend/conn.py +75 -0
  11. sglang/srt/disaggregation/launch_lb.py +0 -13
  12. sglang/srt/disaggregation/mini_lb.py +33 -8
  13. sglang/srt/disaggregation/prefill.py +1 -1
  14. sglang/srt/distributed/parallel_state.py +27 -15
  15. sglang/srt/entrypoints/engine.py +19 -12
  16. sglang/srt/entrypoints/http_server.py +174 -34
  17. sglang/srt/entrypoints/openai/protocol.py +60 -0
  18. sglang/srt/eplb/eplb_manager.py +26 -2
  19. sglang/srt/eplb/expert_distribution.py +29 -2
  20. sglang/srt/hf_transformers_utils.py +10 -0
  21. sglang/srt/layers/activation.py +12 -0
  22. sglang/srt/layers/attention/ascend_backend.py +240 -109
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  24. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  25. sglang/srt/layers/layernorm.py +28 -3
  26. sglang/srt/layers/linear.py +3 -2
  27. sglang/srt/layers/logits_processor.py +1 -1
  28. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  29. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  30. sglang/srt/layers/moe/ep_moe/layer.py +14 -13
  31. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  32. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  37. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  38. sglang/srt/layers/moe/topk.py +35 -12
  39. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  40. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  41. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  42. sglang/srt/layers/quantization/mxfp4.py +9 -4
  43. sglang/srt/layers/quantization/utils.py +13 -0
  44. sglang/srt/layers/quantization/w4afp8.py +30 -25
  45. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  46. sglang/srt/layers/rotary_embedding.py +28 -1
  47. sglang/srt/layers/sampler.py +29 -5
  48. sglang/srt/managers/cache_controller.py +62 -96
  49. sglang/srt/managers/detokenizer_manager.py +9 -2
  50. sglang/srt/managers/io_struct.py +27 -0
  51. sglang/srt/managers/mm_utils.py +5 -1
  52. sglang/srt/managers/multi_tokenizer_mixin.py +629 -0
  53. sglang/srt/managers/scheduler.py +39 -2
  54. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  55. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  56. sglang/srt/managers/tokenizer_manager.py +86 -39
  57. sglang/srt/mem_cache/chunk_cache.py +1 -1
  58. sglang/srt/mem_cache/hicache_storage.py +20 -3
  59. sglang/srt/mem_cache/hiradix_cache.py +94 -71
  60. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  61. sglang/srt/mem_cache/memory_pool.py +4 -0
  62. sglang/srt/mem_cache/memory_pool_host.py +4 -4
  63. sglang/srt/mem_cache/radix_cache.py +5 -4
  64. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  65. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  66. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -9
  67. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
  68. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  69. sglang/srt/model_executor/model_runner.py +5 -4
  70. sglang/srt/model_loader/loader.py +15 -24
  71. sglang/srt/model_loader/utils.py +12 -0
  72. sglang/srt/models/deepseek_v2.py +31 -10
  73. sglang/srt/models/gpt_oss.py +5 -18
  74. sglang/srt/models/llama_eagle3.py +4 -0
  75. sglang/srt/models/longcat_flash.py +1026 -0
  76. sglang/srt/models/longcat_flash_nextn.py +699 -0
  77. sglang/srt/models/qwen2.py +26 -3
  78. sglang/srt/models/qwen2_5_vl.py +65 -41
  79. sglang/srt/models/qwen2_moe.py +22 -2
  80. sglang/srt/models/transformers.py +1 -1
  81. sglang/srt/multimodal/processors/base_processor.py +4 -2
  82. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  83. sglang/srt/server_args.py +112 -55
  84. sglang/srt/speculative/eagle_worker.py +28 -8
  85. sglang/srt/utils.py +4 -0
  86. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  87. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  88. sglang/version.py +1 -1
  89. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +5 -5
  90. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +93 -85
  91. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
  92. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
  93. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
@@ -175,6 +175,8 @@ class FusedMoE(torch.nn.Module):
  self.moe_tp_rank = get_moe_tensor_parallel_rank()
  assert num_experts % self.moe_ep_size == 0
  self.num_local_experts = num_experts // self.moe_ep_size
+ self.start_expert_id = self.moe_ep_rank * self.num_local_experts
+ self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
  if self.moe_ep_size > 1:
  # TODO(ch-wan): support shared experts fusion
  # Create a tensor of size num_experts filled with -1
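The two added attributes give each expert-parallel rank a contiguous slice of the global expert range. A minimal sketch of the arithmetic (plain Python; the values are illustrative, not taken from any real configuration):

```python
# Hypothetical sizes: 8 global experts split across 4 expert-parallel (EP) ranks.
num_experts = 8
moe_ep_size = 4

num_local_experts = num_experts // moe_ep_size  # 2 experts per rank
for moe_ep_rank in range(moe_ep_size):
    start_expert_id = moe_ep_rank * num_local_experts
    end_expert_id = start_expert_id + num_local_experts - 1
    print(f"rank {moe_ep_rank}: experts {start_expert_id}..{end_expert_id}")
# rank 0: experts 0..1, rank 1: experts 2..3, rank 2: experts 4..5, rank 3: experts 6..7
```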
@@ -593,8 +595,9 @@ class FusedMoE(torch.nn.Module):

  if (
  "compressed" in self.quant_method.__class__.__name__.lower()
- and param.data[expert_id] != 1
- and (param.data[expert_id] - loaded_weight).abs() > 1e-5
+ or "w4afp8" in self.quant_config.get_name()
+ and (param.data[expert_id] != 1).any()
+ and ((param.data[expert_id] - loaded_weight).abs() > 1e-5).any()
  ):
  raise ValueError(
  "input_scales of w1 and w3 of a layer "
@@ -0,0 +1,87 @@
+ from __future__ import annotations
+
+ from typing import Tuple
+
+ import torch
+ import triton
+
+ from sglang.srt.utils import is_cuda, is_hip
+
+ _is_cuda = is_cuda()
+ _is_hip = is_hip()
+
+ if _is_cuda or _is_hip:
+ from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+
+ def moe_align_block_size(
+ topk_ids: torch.Tensor, block_size: int, num_experts: int
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Aligns the token distribution across experts to be compatible with block
+ size for matrix multiplication.
+
+ Parameters:
+ - topk_ids: A tensor of shape [total_tokens, top_k] representing the
+ top-k expert indices for each token.
+ - block_size: The block size used in block matrix multiplication.
+ - num_experts: The total number of experts.
+
+ Returns:
+ - sorted_token_ids: A tensor containing the sorted token indices according
+ to their allocated expert.
+ - expert_ids: A tensor indicating the assigned expert index for each block.
+ - num_tokens_post_padded: The total number of tokens after padding,
+ ensuring divisibility by block_size.
+
+ This function pads the number of tokens that each expert needs to process
+ so that it is divisible by block_size.
+ Padding ensures that during block matrix multiplication, the dimensions
+ align correctly.
+
+ Example:
+ Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
+ block_size = 4, and num_experts = 4:
+ - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
+ with each expert needing to process 3 tokens.
+ - As block_size is 4, we pad 1 token for each expert.
+ - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
+ - Then append padding tokens [12, 12, 12, 12] for each block.
+ - After sorting by expert index, we obtain token_ids
+ [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
+ Tokens 12 are non-existent (padding) and are ignored in
+ the subsequent matrix multiplication.
+ - The padding ensures that the total number of tokens is now divisible
+ by block_size for proper block matrix operations.
+ """
+ max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+ sorted_ids = torch.empty(
+ (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+ )
+ max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+ expert_ids = torch.empty(
+ (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+ )
+ num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+ # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
+ cumsum_buffer = torch.empty(
+ (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
+ )
+
+ # Threshold based on benchmark results
+ fuse_sorted_ids_padding = sorted_ids.shape[0] <= 4096
+ if not fuse_sorted_ids_padding:
+ sorted_ids.fill_(topk_ids.numel())
+
+ sgl_moe_align_block_size(
+ topk_ids,
+ num_experts + 1,
+ block_size,
+ sorted_ids,
+ expert_ids,
+ num_tokens_post_pad,
+ cumsum_buffer,
+ fuse_sorted_ids_padding,
+ )
+ return sorted_ids, expert_ids, num_tokens_post_pad
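The new module delegates the actual work to the `sgl_kernel` op, but the padding scheme described in the docstring can be reproduced in plain PyTorch. The following is a rough, unoptimized reference sketch of the sort-by-expert-and-pad-to-block_size idea (my own illustration, not the kernel's implementation):

```python
import torch


def moe_align_block_size_reference(topk_ids: torch.Tensor, block_size: int, num_experts: int):
    """Unoptimized illustration of the padding scheme from the docstring above."""
    flat = topk_ids.flatten()
    pad_value = flat.numel()  # padding entries point one past the last real token
    sorted_chunks, expert_chunks = [], []
    for expert in range(num_experts):
        token_idx = (flat == expert).nonzero(as_tuple=True)[0].to(torch.int32)
        if token_idx.numel() == 0:
            continue
        # Round this expert's token count up to a multiple of block_size.
        padded_len = ((token_idx.numel() + block_size - 1) // block_size) * block_size
        pad = torch.full((padded_len - token_idx.numel(),), pad_value, dtype=torch.int32)
        sorted_chunks.append(torch.cat([token_idx, pad]))
        expert_chunks.append(torch.full((padded_len // block_size,), expert, dtype=torch.int32))
    sorted_ids = torch.cat(sorted_chunks)
    expert_ids = torch.cat(expert_chunks)
    num_tokens_post_pad = torch.tensor([sorted_ids.numel()], dtype=torch.int32)
    return sorted_ids, expert_ids, num_tokens_post_pad


# Mirrors the docstring example (experts renumbered to start at 0).
topk_ids = torch.tensor([[1, 2, 3], [0, 1, 3], [0, 2, 3], [0, 1, 2]])
sorted_ids, expert_ids, n_padded = moe_align_block_size_reference(topk_ids, block_size=4, num_experts=4)
print(sorted_ids.tolist())  # [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]
print(expert_ids.tolist(), n_padded.item())  # [0, 1, 2, 3] 16
```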
@@ -304,12 +304,12 @@ class TopK(CustomOp):
  global_num_experts = router_logits.shape[-1]

  # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
- if global_num_experts == 256 and self.topk_config.renormalize is True:
+ if global_num_experts == 256:

  routed_scaling_factor = self.topk_config.routed_scaling_factor or 1
  router_logits = router_logits.to(torch.float32)

- return torch_npu.npu_moe_gating_top_k(
+ topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
  router_logits,
  k=self.topk_config.top_k,
  bias=self.topk_config.correction_bias.to(torch.float32),
@@ -321,6 +321,16 @@ class TopK(CustomOp):
  routed_scaling_factor=routed_scaling_factor,
  eps=float(1e-20),
  )
+
+ if self.topk_config.renormalize:
+ topk_weights_sum = (
+ topk_weights.sum(dim=-1, keepdim=True)
+ if self.topk_config.num_fused_shared_experts == 0
+ else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+ )
+ topk_weights = topk_weights / topk_weights_sum
+
+ return StandardTopKOutput(topk_weights, topk_ids, _)
  else:
  self.topk_config.torch_native = True
  return select_experts(
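The renormalization added in this hunk divides the routed weights by their sum; when a shared expert is fused in as the last column, that column is excluded from the sum so the routed weights renormalize among themselves. A tiny sketch of the two cases with an illustrative tensor:

```python
import torch

topk_weights = torch.tensor([[0.5, 0.3, 0.2]])

# No fused shared expert: normalize over all selected experts.
w_all = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

# Shared expert fused in as the last column: divide by the sum of the routed
# columns only, so the routed weights sum to 1 relative to each other.
w_shared = topk_weights / topk_weights[:, :-1].sum(dim=-1, keepdim=True)
print(w_all)     # tensor([[0.5000, 0.3000, 0.2000]])
print(w_shared)  # tensor([[0.6250, 0.3750, 0.2500]])
```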
@@ -347,17 +357,28 @@ def fused_topk_torch_native(
  gating_output: torch.Tensor,
  topk: int,
  renormalize: bool,
+ correction_bias: torch.Tensor = None,
  ):
- assert (
- hidden_states.shape[0] == gating_output.shape[0]
- ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"
- M, _ = hidden_states.shape
- topk_weights = torch.empty(
- M, topk, dtype=torch.float32, device=hidden_states.device
- )
- topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
- topk_weights = F.softmax(gating_output.float(), dim=-1)
- topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+ if correction_bias is not None:
+ n_routed_experts = gating_output.shape[-1]
+ scores = gating_output.softmax(dim=-1)
+ scores_for_choice = scores.view(
+ -1, n_routed_experts
+ ) + correction_bias.unsqueeze(0)
+ topk_ids = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=False)[1]
+ topk_weights = scores.gather(1, topk_ids)
+ else:
+ assert (
+ hidden_states.shape[0] == gating_output.shape[0]
+ ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"
+ M, _ = hidden_states.shape
+ topk_weights = torch.empty(
+ M, topk, dtype=torch.float32, device=hidden_states.device
+ )
+ topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+ topk_weights = F.softmax(gating_output.float(), dim=-1)
+ topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+
  if renormalize:
  topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
  return topk_weights, topk_ids
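The new `correction_bias` branch selects experts by bias-adjusted scores but keeps the unbiased softmax probabilities as routing weights, so the bias steers which experts are chosen without changing how much weight they receive. A self-contained sketch with made-up numbers:

```python
import torch

gating_output = torch.tensor([[1.0, 2.0, 0.5, 0.1]])   # router logits for one token
correction_bias = torch.tensor([0.0, -5.0, 0.0, 0.0])  # e.g. a load-balancing bias (illustrative)
topk = 2

scores = gating_output.softmax(dim=-1)
scores_for_choice = scores + correction_bias.unsqueeze(0)

# Experts are chosen by the biased scores...
topk_ids = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=False)[1]
# ...but weighted by the original, unbiased probabilities.
topk_weights = scores.gather(1, topk_ids)
print(topk_ids, topk_weights)
```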
@@ -370,6 +391,7 @@ def fused_topk_cpu(
  renormalize: bool,
  num_token_non_padded: Optional[torch.Tensor] = None,
  expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+ correction_bias: torch.Tensor = None,
  ):
  topk_weights, topk_ids = torch.ops.sgl_kernel.topk_softmax_cpu(
  hidden_states=hidden_states,
@@ -815,6 +837,7 @@ def select_experts(
  gating_output=router_logits,
  topk=top_k,
  renormalize=renormalize,
+ correction_bias=correction_bias,
  )
  elif custom_routing_function is None:
  assert not apply_routed_scaling_factor_on_output, "Not implemented"
@@ -93,7 +93,7 @@ def _maybe_compile_deep_gemm_one_type_all(
  if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
  logger.warning(
  "Entering DeepGEMM JIT Pre-Compile session. "
- "It may takes a long time (typically 10-20 mins) "
+ "It may take a long time (typically 10-20 mins) "
  "if you have not run `sglang.compile_deep_gemm`. "
  "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
  " for pre-compilation to reduce the overhead if you have not run it before. "
@@ -132,9 +132,17 @@ def _compile_deep_gemm_one_type_all(
  kernel_type, max_m=max(m_list), n=n, k=k, num_groups=num_groups
  )

+ old_compile_mode = deep_gemm.get_compile_mode()
+ deep_gemm.set_compile_mode(1)
  # TODO can use multi thread
  for m in tqdm(m_list, desc=f"DeepGEMM warmup"):
  executor.execute(m=m)
+ deep_gemm.set_compile_mode(old_compile_mode)
+
+ # clean up input buffers
+ torch.cuda.current_stream().synchronize()
+ del executor
+ torch.cuda.empty_cache()


  class _BaseWarmupExecutor:
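The warmup now saves the DeepGEMM compile mode, forces it for the loop, and restores it afterwards. If that pattern spreads, a context manager built on the same `get_compile_mode`/`set_compile_mode` calls would make the restore exception-safe; a hedged sketch (the wrapper name is hypothetical):

```python
from contextlib import contextmanager


@contextmanager
def temporary_compile_mode(deep_gemm_module, mode: int):
    """Temporarily switch the DeepGEMM compile mode, restoring it even on error."""
    old_mode = deep_gemm_module.get_compile_mode()
    deep_gemm_module.set_compile_mode(mode)
    try:
        yield
    finally:
        deep_gemm_module.set_compile_mode(old_mode)

# Usage (hypothetical): with temporary_compile_mode(deep_gemm, 1): run_warmup()
```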
@@ -11,9 +11,6 @@ def _compute_enable_deep_gemm():
  sm_version = get_device_sm()
  if sm_version < 90:
  return False
- # TODO fix deepgemm cu129 fp8 issue
- if torch.version.cuda == "12.9":
- return False

  try:
  import deep_gemm
@@ -599,6 +599,13 @@ class ModelOptFp4Config(QuantizationConfig):
  regex_str = pattern.replace(".", r"\.").replace("*", r".*")
  if re.fullmatch(regex_str, prefix):
  return True
+
+ # Check if the last part of the excluded pattern is contained in the last part of the prefix
+ # This handles fused modules like fused_qkv_a_proj_with_mqa that contain q_a_proj and kv_a_proj_with_mqa
+ pattern_last_part = pattern.split(".")[-1]
+ prefix_last_part = prefix.split(".")[-1]
+ if pattern_last_part in prefix_last_part:
+ return True
  return False

  def get_quant_method(
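The added fallback catches fused modules whose name embeds an excluded sub-module name, which the wildcard regex alone would not match. A standalone illustration of the two checks (the helper name and module paths are mine):

```python
import re


def matches_exclusion(pattern: str, prefix: str) -> bool:
    regex_str = pattern.replace(".", r"\.").replace("*", r".*")
    if re.fullmatch(regex_str, prefix):
        return True
    # Fallback: the last component of the pattern appears inside the last
    # component of the prefix, e.g. "kv_a_proj_with_mqa" inside "fused_qkv_a_proj_with_mqa".
    return pattern.split(".")[-1] in prefix.split(".")[-1]


print(matches_exclusion(
    "model.layers.*.kv_a_proj_with_mqa",
    "model.layers.0.self_attn.fused_qkv_a_proj_with_mqa",
))  # True, via the fallback; the regex alone fails on the fused name
```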
@@ -66,10 +66,15 @@ _is_hip = is_hip()

  if _is_hip:
  # import aiter
- from aiter import ActivationType, QuantType, dtypes
- from aiter.fused_moe import fused_moe
- from aiter.ops.triton.quant import dynamic_mxfp4_quant
- from aiter.utility.fp4_utils import e8m0_shuffle
+ try:
+ from aiter import ActivationType, QuantType, dtypes
+ from aiter.fused_moe import fused_moe
+ from aiter.ops.triton.quant import dynamic_mxfp4_quant
+ from aiter.utility.fp4_utils import e8m0_shuffle
+ except ImportError as err:
+ ActivationType = QuantType = dtypes = fused_moe = dynamic_mxfp4_quant = (
+ e8m0_shuffle
+ ) = err


  def _swizzle_mxfp4(quant_tensor, scale, num_warps):
@@ -77,6 +77,19 @@ def is_layer_skipped(
  )
  else:
  is_skipped = prefix in ignored_layers
+ if "gate_up_proj" in prefix:
+ prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
+ prefix_up = prefix.replace("gate_up_proj", "up_proj")
+ if prefix_gate in ignored_layers and prefix_up in ignored_layers:
+ is_skipped = True
+ elif "experts" in prefix:
+ is_skipped = any(
+ [
+ prefix in layer_name
+ for layer_name in ignored_layers
+ if "experts" in layer_name
+ ]
+ )

  assert is_skipped is not None
  return is_skipped
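The new branches let a fused `gate_up_proj` module be skipped when both of its unfused halves are listed, and let expert sub-layers match by substring. A quick standalone illustration (the layer names are hypothetical):

```python
ignored_layers = {
    "model.layers.0.mlp.gate_proj",
    "model.layers.0.mlp.up_proj",
}

prefix = "model.layers.0.mlp.gate_up_proj"
is_skipped = prefix in ignored_layers  # False: the fused name itself is not listed
if "gate_up_proj" in prefix:
    prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
    prefix_up = prefix.replace("gate_up_proj", "up_proj")
    if prefix_gate in ignored_layers and prefix_up in ignored_layers:
        is_skipped = True  # both unfused halves are ignored, so skip the fused module
print(is_skipped)  # True
```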
@@ -1,12 +1,14 @@
  from __future__ import annotations

  import logging
- from typing import TYPE_CHECKING, Any, Dict, List, Optional
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional

  import torch
  from torch.nn import Module
  from torch.nn.parameter import Parameter

+ from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+ from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
  from sglang.srt.layers.quantization.base_config import (
  FusedMoEMethodBase,
  QuantizationConfig,
@@ -91,12 +93,13 @@ class W4AFp8Config(QuantizationConfig):
  from sglang.srt.layers.linear import LinearBase
  from sglang.srt.layers.moe.ep_moe.layer import EPMoE
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+ from sglang.srt.managers.schedule_batch import global_server_args_dict

  if isinstance(layer, LinearBase):
  if is_layer_skipped(prefix, self.ignored_layers):
  return UnquantizedLinearMethod()
  return Fp8LinearMethod(self)
- elif isinstance(layer, EPMoE):
+ elif isinstance(layer, FusedMoE):
  return W4AFp8MoEMethod(self)
  return None

@@ -104,8 +107,24 @@ class W4AFp8Config(QuantizationConfig):
  return []


- class W4AFp8MoEMethod(FusedMoEMethodBase):
+ def interleave_scales(scales: torch.Tensor) -> torch.Tensor:
+ """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
+ s_shape = scales.shape
+ # Reshape to separate groups of 4
+ alignment = 4 if s_shape[2] % 4 == 0 else 1
+ scales_interleaved = scales.reshape(
+ s_shape[0], s_shape[1], (s_shape[2] // alignment), alignment
+ )
+ # Permute dimensions to interleave
+ scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
+ # Reshape back to original dimensions but with interleaved values
+ scales_interleaved = scales_interleaved.reshape(
+ s_shape[0], s_shape[2] // alignment, s_shape[1] * alignment
+ )
+ return scales_interleaved.contiguous()
+

+ class W4AFp8MoEMethod(FusedMoEMethodBase):
  def __init__(self, quant_config: W4AFp8Config):
  self.quant_config = quant_config

@@ -234,33 +253,18 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

  return

- def _interleave_scales(self, scales: torch.Tensor) -> torch.Tensor:
- """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
- s_shape = scales.shape
- # Reshape to separate groups of 4
- scales_interleaved = scales.reshape(
- s_shape[0], s_shape[1], (s_shape[2] // 4), 4
- )
- # Permute dimensions to interleave
- scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
- # Reshape back to original dimensions but with interleaved values
- scales_interleaved = scales_interleaved.reshape(
- s_shape[0], s_shape[2] // 4, s_shape[1] * 4
- )
- return scales_interleaved.contiguous()
-
  def process_weights_after_loading(self, layer: Module) -> None:
  dtype = torch.bfloat16
  device = layer.w2_weight.device

  # Interleave w13_weight_scale (gate_up_proj)
  w13_weight_scale = layer.w13_weight_scale_inv.to(dtype)
- w13_weight_scale = self._interleave_scales(w13_weight_scale)
+ w13_weight_scale = interleave_scales(w13_weight_scale)
  layer.w13_weight_scale_inv = Parameter(w13_weight_scale, requires_grad=False)

  # Interleave w2_weight_scale (down_proj)
  w2_weight_scale = layer.w2_weight_scale_inv.to(dtype)
- w2_weight_scale = self._interleave_scales(w2_weight_scale)
+ w2_weight_scale = interleave_scales(w2_weight_scale)
  layer.w2_weight_scale_inv = Parameter(w2_weight_scale, requires_grad=False)

  # Process input scales
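The interleaving helper is now module-level and gains an `alignment` fallback for scale tensors whose last dimension is not a multiple of 4. A shape-level demo of what the reshape/permute does (the shapes are illustrative only):

```python
import torch

scales = torch.arange(48.0).reshape(2, 3, 8)  # (num_experts, N, num_groups), made-up sizes

alignment = 4 if scales.shape[2] % 4 == 0 else 1
out = (
    scales.reshape(2, 3, 8 // alignment, alignment)  # split groups into blocks of 4
    .permute(0, 2, 1, 3)                             # bring the block index forward
    .reshape(2, 8 // alignment, 3 * alignment)       # rows now carry interleaved scale blocks
    .contiguous()
)
print(out.shape)  # torch.Size([2, 2, 12])
```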
@@ -291,11 +295,12 @@ class W4AFp8MoEMethod(FusedMoEMethodBase):

  topk_weights, topk_ids, _ = topk_output
  local_topk_ids = topk_ids
- local_topk_ids = torch.where(
- topk_ids == -1,
- layer.num_experts,
- topk_ids,
- )
+ if get_moe_expert_parallel_world_size() > 1:
+ local_topk_ids = torch.where(
+ topk_ids == -1,
+ layer.num_experts,
+ topk_ids,
+ )

  output = cutlass_w4a8_moe(
  layer.start_expert_id,
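With more than one expert-parallel rank, experts routed to other ranks arrive as `-1` in `topk_ids` and are redirected to the sentinel slot `num_experts`; with a single rank the remap is now skipped. A minimal illustration of the remap itself:

```python
import torch

num_experts = 4                              # experts hosted on this rank (illustrative)
topk_ids = torch.tensor([[0, 3], [-1, 2]])   # -1 marks an expert owned by another rank

local_topk_ids = torch.where(topk_ids == -1, num_experts, topk_ids)
print(local_topk_ids)  # tensor([[0, 3], [4, 2]]) -- -1 redirected to the padding slot
```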
@@ -551,7 +551,7 @@ class NPU_W8A8LinearMethodImpl:
  def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
  params_dict = {}
  params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
- params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
+ params_dict["input_offset"] = torch.empty(1, dtype=params_dtype)
  return params_dict

  @staticmethod
@@ -582,11 +582,11 @@ class NPU_W8A8LinearMethodImpl:
  if original_dtype != torch.int8:
  x = torch_npu.npu_quantize(
  x,
- layer.aclnn_input_scale,
+ layer.aclnn_input_scale_reciprocal,
  layer.aclnn_input_offset,
  torch.qint8,
  -1,
- True,
+ False,
  )
  # Only fuse bias add into GEMM for rank 0 (this ensures that
  # bias will not get added more than once in Attention TP>1 case)
@@ -608,6 +608,10 @@ class NPU_W8A8LinearMethodImpl:
  layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
  requires_grad=False,
  )
+ layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+ layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+ requires_grad=False,
+ )
  layer.aclnn_input_offset = torch.nn.Parameter(
  layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
  requires_grad=False,
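A reciprocal of the input scale is now precomputed, presumably so the quantize call can multiply rather than divide (the flipped boolean argument appears to select that mode). The underlying arithmetic, shown in plain PyTorch with made-up values:

```python
import torch

x = torch.tensor([0.10, -0.37, 1.25])
input_scale = torch.tensor([0.03125])  # power-of-two scale keeps both paths bit-identical

# Dividing by the scale and multiplying by its precomputed reciprocal produce
# the same int8 codes; the reciprocal trades a divide for a cheaper multiply.
q_div = torch.round(x / input_scale).clamp(-128, 127).to(torch.int8)
q_mul = torch.round(x * (1.0 / input_scale)).clamp(-128, 127).to(torch.int8)
print(q_div.tolist(), q_mul.tolist())  # [3, -12, 40] [3, -12, 40]
```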
@@ -1876,7 +1876,7 @@ def rotate_half(x):
  return torch.cat((-x2, x1), dim=-1)


- def apply_rotary_pos_emb(
+ def apply_rotary_pos_emb_native(
  q: torch.Tensor,
  k: torch.Tensor,
  cos: torch.Tensor,
@@ -1899,6 +1899,33 @@ def apply_rotary_pos_emb(
  return q_embed, k_embed


+ def apply_rotary_pos_emb_npu(
+ q: torch.Tensor,
+ k: torch.Tensor,
+ cos: torch.Tensor,
+ sin: torch.Tensor,
+ unsqueeze_dim=1,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ if q.shape[1] != 128:
+ return apply_rotary_pos_emb_native(q, k, cos, sin, unsqueeze_dim)
+ cos = cos.unsqueeze(unsqueeze_dim)
+ cos = torch.transpose(cos, 1, 2)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ sin = torch.transpose(sin, 1, 2)
+ q = torch.transpose(q, 1, 2)
+ k = torch.transpose(k, 1, 2)
+ q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb(q, k, cos, sin)
+ q_embed = torch.transpose(q_embed, 1, 2)
+ k_embed = torch.transpose(k_embed, 1, 2)
+ return q_embed, k_embed
+
+
+ if _is_npu:
+ apply_rotary_pos_emb = apply_rotary_pos_emb_npu
+ else:
+ apply_rotary_pos_emb = apply_rotary_pos_emb_native
+
+
  def get_rope_cpu(
  head_size: int,
  rotary_dim: int,
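For reference, the native path that the NPU wrapper falls back to is the usual rotate-half formulation built on the `rotate_half` helper shown above. A self-contained sketch of that formulation (shapes follow the `unsqueeze_dim=1` convention; this is an illustration, not the file's exact code):

```python
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb_ref(q, k, cos, sin, unsqueeze_dim=1):
    # cos/sin: [batch, seq, head_dim]; unsqueeze to broadcast over the head dim.
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


q = torch.randn(2, 8, 16, 64)  # [batch, heads, seq, head_dim], illustrative sizes
k = torch.randn(2, 8, 16, 64)
cos = torch.randn(2, 16, 64)
sin = torch.randn(2, 16, 64)
print(apply_rotary_pos_emb_ref(q, k, cos, sin)[0].shape)  # torch.Size([2, 8, 16, 64])
```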
@@ -27,6 +27,7 @@ if is_cuda():
  logger = logging.getLogger(__name__)

  SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+ RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")


  class Sampler(nn.Module):
@@ -77,7 +78,12 @@ class Sampler(nn.Module):
  batch_next_token_ids = torch.argmax(logits, -1)
  if return_logprob:
  logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+
  else:
+ # Post process original logits. if temperatures are all 1.0, no need to rescale
+ if return_logprob and RETURN_ORIGINAL_LOGPROB:
+ logprobs = torch.softmax(logits, dim=-1)
+
  # Post process logits
  logits.div_(sampling_info.temperatures)
  logits[:] = torch.softmax(logits, dim=-1)
@@ -116,7 +122,12 @@ class Sampler(nn.Module):

  if return_logprob:
  # clamp to avoid -inf
- logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
+ if RETURN_ORIGINAL_LOGPROB:
+ logprobs = torch.log(logprobs).clamp(
+ min=torch.finfo(logprobs.dtype).min
+ )
+ else:
+ logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)

  # Attach logprobs to logits_output (in-place modification)
  if return_logprob:
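The effect of `RETURN_ORIGINAL_LOGPROB` is to report log-probabilities of the unscaled distribution even though sampling itself uses temperature-scaled probabilities. A compact illustration of the two quantities (values are arbitrary):

```python
import torch

logits = torch.tensor([[2.0, 1.0, 0.5]])
temperature = 0.7

original_logprobs = torch.log_softmax(logits, dim=-1)         # reported when RETURN_ORIGINAL_LOGPROB is set
sampling_probs = torch.softmax(logits / temperature, dim=-1)  # distribution actually sampled from
scaled_logprobs = torch.log(sampling_probs).clamp(min=torch.finfo(sampling_probs.dtype).min)

print(original_logprobs)
print(scaled_logprobs)  # temperature < 1 concentrates mass, so these differ from the originals
```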
@@ -201,7 +212,10 @@ def top_p_normalize_probs_torch(
  return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)


- def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
+ def get_top_logprobs(
+ logprobs: torch.Tensor,
+ top_logprobs_nums: List[int],
+ ):
  max_k = max(top_logprobs_nums)
  ret = logprobs.topk(max_k, dim=1)
  values = ret.values.tolist()
@@ -212,10 +226,17 @@ def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
  for i, k in enumerate(top_logprobs_nums):
  output_top_logprobs_val.append(values[i][:k])
  output_top_logprobs_idx.append(indices[i][:k])
- return output_top_logprobs_val, output_top_logprobs_idx
+
+ return (
+ output_top_logprobs_val,
+ output_top_logprobs_idx,
+ )


- def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List[int]]):
+ def get_token_ids_logprobs(
+ logprobs: torch.Tensor,
+ token_ids_logprobs: List[List[int]],
+ ):
  output_token_ids_logprobs_val = []
  output_token_ids_logprobs_idx = []
  for i, token_ids in enumerate(token_ids_logprobs):
@@ -226,7 +247,10 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
  output_token_ids_logprobs_val.append([])
  output_token_ids_logprobs_idx.append([])

- return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+ return (
+ output_token_ids_logprobs_val,
+ output_token_ids_logprobs_idx,
+ )


  def apply_custom_logit_processor(