sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0

sglang/srt/layers/attention/fla/layernorm_gated.py

@@ -12,7 +12,9 @@ import triton
 import triton.language as tl
 from einops import rearrange
 
-from sglang.srt.utils import device_context
+from sglang.srt.utils import device_context, is_npu
+
+_is_npu = is_npu()
 
 
 def rms_norm_ref(
@@ -182,6 +184,10 @@ def _layer_norm_fwd(
     return out, mean, rstd
 
 
+if _is_npu:
+    from sgl_kernel_npu.fla.layernorm_gated import layer_norm_fwd_npu as _layer_norm_fwd
+
+
 def rms_norm_gated(
     *,
     x,

sglang/srt/layers/attention/flashattention_backend.py

@@ -584,7 +584,9 @@ class FlashAttentionBackend(AttentionBackend):
                 metadata, metadata_expand
             )
 
-        elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed():
+        elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed(
+            include_draft_extend_v2=True
+        ):
             metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
             metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
             metadata.cu_seqlens_k = torch.nn.functional.pad(
@@ -594,10 +596,9 @@ class FlashAttentionBackend(AttentionBackend):
                 forward_batch.req_pool_indices, : metadata.max_seq_len_k
             ]
 
-            if (
-                any(forward_batch.extend_prefix_lens_cpu)
-                or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
-            ):
+            if any(
+                forward_batch.extend_prefix_lens_cpu
+            ) or forward_batch.forward_mode.is_draft_extend(include_v2=True):
                 extend_seq_lens = forward_batch.extend_seq_lens
                 metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
                 metadata.cu_seqlens_q = torch.nn.functional.pad(
@@ -826,7 +827,7 @@ class FlashAttentionBackend(AttentionBackend):
         if (
             forward_batch.attn_attend_prefix_cache is not None
             and not forward_batch.forward_mode.is_target_verify()
-            and not forward_batch.forward_mode.is_draft_extend()
+            and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
         ):
             # Do multi-head attention with chunked prefix cache
             if forward_batch.attn_attend_prefix_cache:

sglang/srt/layers/attention/flashinfer_mla_backend.py

@@ -242,9 +242,11 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         else:
             self.q_indptr_decode = q_indptr_decode_buf
 
-        self.fmha_backend = "auto"
         if is_sm100_supported():
             self.fmha_backend = "cutlass"
+        else:
+            self.fmha_backend = "auto"
+
         self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
             self.workspace_buffer, "NHD", backend=self.fmha_backend
         )

sglang/srt/layers/attention/flashmla_backend.py

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Callable, Optional, Tuple, Union
 
 import torch
 import triton
-from flash_mla import flash_mla_with_kvcache, get_mla_metadata
+from sgl_kernel.flash_mla import flash_mla_with_kvcache, get_mla_metadata
 
 from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
 from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton

sglang/srt/layers/attention/hybrid_linear_attn_backend.py

@@ -1,6 +1,7 @@
 from typing import Optional, Union
 
 import torch
+from einops import rearrange
 
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
@@ -10,6 +11,11 @@ from sglang.srt.layers.attention.fla.fused_recurrent import (
 from sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrent import (
     fused_sigmoid_gating_delta_rule_update,
 )
+from sglang.srt.layers.attention.fla.kda import (
+    chunk_kda,
+    fused_kda_gate,
+    fused_recurrent_kda,
+)
 from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
     PAD_SLOT_ID,
     causal_conv1d_fn,
@@ -227,6 +233,223 @@ class MambaAttnBackendBase(AttentionBackend):
         return 1  # Mamba attn does not use seq lens to index kv cache
 
 
+class KimiLinearAttnBackend(MambaAttnBackendBase):
+    """Attention backend using Mamba kernel."""
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        q_proj_states = kwargs["q_proj_states"]
+        k_proj_states = kwargs["k_proj_states"]
+        v_proj_states = kwargs["v_proj_states"]
+        q_conv_weights = kwargs["q_conv_weights"]
+        k_conv_weights = kwargs["k_conv_weights"]
+        v_conv_weights = kwargs["v_conv_weights"]
+
+        q_conv_bias = kwargs["q_conv_bias"]
+        k_conv_bias = kwargs["k_conv_bias"]
+        v_conv_bias = kwargs["v_conv_bias"]
+
+        A_log = kwargs["A_log"]
+        dt_bias = kwargs["dt_bias"]
+        b_proj = kwargs["b_proj"]
+        f_a_proj = kwargs["f_a_proj"]
+        f_b_proj = kwargs["f_b_proj"]
+        hidden_states = kwargs["hidden_states"]
+        head_dim = kwargs["head_dim"]
+        layer_id = kwargs["layer_id"]
+
+        layer_cache = self.req_to_token_pool.mamba2_layer_cache(layer_id)
+        q_conv_state, k_conv_state, v_conv_state = layer_cache.conv
+        ssm_states = layer_cache.temporal
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        q_conv_state = q_conv_state.transpose(-1, -2)
+        k_conv_state = k_conv_state.transpose(-1, -2)
+        v_conv_state = v_conv_state.transpose(-1, -2)
+
+        q = causal_conv1d_update(
+            q_proj_states,
+            q_conv_state,
+            q_conv_weights,
+            q_conv_bias,
+            activation="silu",
+            conv_state_indices=cache_indices,
+        )
+        k = causal_conv1d_update(
+            k_proj_states,
+            k_conv_state,
+            k_conv_weights,
+            k_conv_bias,
+            activation="silu",
+            conv_state_indices=cache_indices,
+        )
+        v = causal_conv1d_update(
+            v_proj_states,
+            v_conv_state,
+            v_conv_weights,
+            v_conv_bias,
+            activation="silu",
+            conv_state_indices=cache_indices,
+        )
+
+        q, k, v = map(
+            lambda x: rearrange(x, "n (h d) -> 1 n h d", d=head_dim), (q, k, v)
+        )
+
+        beta = b_proj(hidden_states)[0].float().sigmoid()
+
+        g = f_b_proj(f_a_proj(hidden_states)[0])[0]
+        g = fused_kda_gate(g, A_log, head_dim, g_bias=dt_bias)
+
+        beta = beta.unsqueeze(0)
+        g = g.unsqueeze(0)
+
+        initial_state = ssm_states[cache_indices].contiguous()
+        (
+            core_attn_out,
+            last_recurrent_state,
+        ) = fused_recurrent_kda(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            initial_state=initial_state,
+            use_qk_l2norm_in_kernel=True,
+            cu_seqlens=query_start_loc,
+        )
+        ssm_states[cache_indices] = last_recurrent_state
+        return core_attn_out
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+            causal_conv1d_fn,
+        )
+
+        q_proj_states = kwargs["q_proj_states"]
+        k_proj_states = kwargs["k_proj_states"]
+        v_proj_states = kwargs["v_proj_states"]
+        q_conv_weights = kwargs["q_conv_weights"]
+        k_conv_weights = kwargs["k_conv_weights"]
+        v_conv_weights = kwargs["v_conv_weights"]
+
+        q_conv_bias = kwargs["q_conv_bias"]
+        k_conv_bias = kwargs["k_conv_bias"]
+        v_conv_bias = kwargs["v_conv_bias"]
+
+        A_log = kwargs["A_log"]
+        dt_bias = kwargs["dt_bias"]
+        b_proj = kwargs["b_proj"]
+        f_a_proj = kwargs["f_a_proj"]
+        f_b_proj = kwargs["f_b_proj"]
+        hidden_states = kwargs["hidden_states"]
+        head_dim = kwargs["head_dim"]
+        layer_id = kwargs["layer_id"]
+
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        mamba_cache_params = self.req_to_token_pool.mamba2_layer_cache(layer_id)
+        conv_state_q, conv_state_k, conv_state_v = mamba_cache_params.conv
+        # deal with strides
+        conv_state_q = conv_state_q.transpose(-1, -2)
+        conv_state_k = conv_state_k.transpose(-1, -2)
+        conv_state_v = conv_state_v.transpose(-1, -2)
+
+        ssm_states = mamba_cache_params.temporal
+
+        has_initial_state = forward_batch.extend_prefix_lens > 0
+
+        q_proj_states = q_proj_states.transpose(0, 1)
+        k_proj_states = k_proj_states.transpose(0, 1)
+        v_proj_states = v_proj_states.transpose(0, 1)
+
+        q = causal_conv1d_fn(
+            q_proj_states,
+            q_conv_weights,
+            q_conv_bias,
+            activation="silu",
+            conv_states=conv_state_q,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+
+        k = causal_conv1d_fn(
+            k_proj_states,
+            k_conv_weights,
+            k_conv_bias,
+            activation="silu",
+            conv_states=conv_state_k,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+
+        v = causal_conv1d_fn(
+            v_proj_states,
+            v_conv_weights,
+            v_conv_bias,
+            activation="silu",
+            conv_states=conv_state_v,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+
+        q, k, v = map(
+            lambda x: rearrange(x, "n (h d) -> 1 n h d", d=head_dim), (q, k, v)
+        )
+
+        beta = b_proj(hidden_states)[0].float().sigmoid()
+
+        g = f_b_proj(f_a_proj(hidden_states)[0])[0]
+        g = fused_kda_gate(g, A_log, head_dim, g_bias=dt_bias)
+
+        beta = beta.unsqueeze(0)
+        g = g.unsqueeze(0)
+
+        initial_state = ssm_states[cache_indices].contiguous()
+        (
+            core_attn_out,
+            last_recurrent_state,
+        ) = chunk_kda(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            initial_state=initial_state,
+            output_final_state=True,
+            use_qk_l2norm_in_kernel=True,
+            cu_seqlens=query_start_loc,
+        )
+        ssm_states[cache_indices] = last_recurrent_state
+
+        return core_attn_out
+
+
 class GDNAttnBackend(MambaAttnBackendBase):
     """Attention backend using Mamba kernel."""
 
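Both new forward paths reshape the flat per-token projections into the single-batch, varlen layout expected by the KDA kernels (fused_recurrent_kda for decode, chunk_kda for extend), with sequence boundaries carried in cu_seqlens=query_start_loc. A minimal, self-contained sketch of that layout change; the sizes below are illustrative only, not the model's real dimensions:

import torch
from einops import rearrange

# Illustrative sizes; the real head_dim arrives through kwargs["head_dim"].
num_tokens, n_heads, head_dim = 6, 4, 32
q = torch.randn(num_tokens, n_heads * head_dim)

# Same rearrange pattern as in forward_decode / forward_extend above:
# flat [num_tokens, n_heads * head_dim] -> [1, num_tokens, n_heads, head_dim].
q4d = rearrange(q, "n (h d) -> 1 n h d", d=head_dim)
assert q4d.shape == (1, num_tokens, n_heads, head_dim)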

sglang/srt/layers/attention/mamba/mamba.py

@@ -13,16 +13,6 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from sglang.srt.distributed.utils import divide
-from sglang.srt.layers.attention.mamba.causal_conv1d import (
-    causal_conv1d_fn,
-    causal_conv1d_update,
-)
-from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
-    causal_conv1d_fn as causal_conv1d_fn_triton,
-)
-from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
-    causal_conv1d_update as causal_conv1d_update_triton,
-)
 from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata
 from sglang.srt.layers.attention.mamba.mixer2_rms_norm_gated import Mixer2RMSNormGated
 from sglang.srt.layers.attention.mamba.ops import (
@@ -40,7 +30,26 @@ from sglang.srt.model_loader.weight_utils import (
     composed_weight_loader,
     sharded_weight_loader,
 )
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
+
+if is_cuda():
+    from sglang.srt.layers.attention.mamba.causal_conv1d import (
+        causal_conv1d_fn,
+        causal_conv1d_update,
+    )
+    from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+        causal_conv1d_fn as causal_conv1d_fn_triton,
+    )
+    from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+        causal_conv1d_update as causal_conv1d_update_triton,
+    )
+elif is_npu():
+    from sgl_kernel_npu.mamba.causal_conv1d import (
+        causal_conv1d_fn_npu as causal_conv1d_fn,
+    )
+    from sgl_kernel_npu.mamba.causal_conv1d import (
+        causal_conv1d_update_npu as causal_conv1d_update,
+    )
 
 LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None]
 

sglang/srt/layers/attention/nsa/dequant_k_cache.py

@@ -22,6 +22,10 @@ def _dequantize_k_cache_slow(
     De-quantize the k-cache
     """
     assert dv % tile_size == 0
+    original_ndim = quant_k_cache.ndim
+    if original_ndim == 3:
+        # set block_size = 1
+        quant_k_cache = quant_k_cache.unsqueeze(1)
     num_tiles = dv // tile_size
     num_blocks, block_size, h_k, _ = quant_k_cache.shape
     assert h_k == 1
@@ -45,8 +49,10 @@
                 cur_nope * cur_scales
             )
 
-    result = result.view(num_blocks, block_size, 1, d)
-    return result
+    if original_ndim == 3:
+        return result.view(num_blocks, 1, -1)
+    else:
+        return result.view(num_blocks, block_size, 1, -1)
 
 
 def _dequantize_k_cache_fast_wrapped(
@@ -54,7 +60,10 @@ def _dequantize_k_cache_fast_wrapped(
     dv: int = 512,
     tile_size: int = 128,
 ) -> torch.Tensor:
-    # TODO the final API may be 2D instead of 4D, thus we convert them here
+    original_ndim = quant_k_cache.ndim
+    if original_ndim == 3:
+        # set block_size = 1
+        quant_k_cache = quant_k_cache.unsqueeze(1)
     num_blocks, block_size, _, dim_quant = quant_k_cache.shape
     assert dv == 512
     assert dim_quant == 656
@@ -63,7 +72,10 @@
 
     output = _dequantize_k_cache_fast(quant_k_cache)
 
-    return output.view(num_blocks, block_size, 1, -1)
+    if original_ndim == 3:
+        return output.view(num_blocks, 1, -1)
+    else:
+        return output.view(num_blocks, block_size, 1, -1)
 
 
 def _dequantize_k_cache_fast(quant_k_cache, group_size: int = 128):
@@ -85,7 +97,6 @@ def _dequantize_k_cache_fast(quant_k_cache, group_size: int = 128):
     assert num_blocks_per_token == 5
 
     assert dim_nope % group_size == 0
-    NUM_NOPE_BLOCKS = dim_nope // group_size
 
     input_nope_q = quant_k_cache[:, :dim_nope]
     input_nope_s = quant_k_cache[:, dim_nope : dim_nope + num_tiles * 4].view(
@@ -102,7 +113,7 @@
         input_nope_q.stride(0),
         input_nope_s.stride(0),
         input_rope.stride(0),
-        NUM_NOPE_BLOCKS=NUM_NOPE_BLOCKS,
+        NUM_NOPE_BLOCKS=num_tiles,
         GROUP_SIZE=group_size,
         DIM_NOPE=dim_nope,
         DIM_ROPE=dim_rope,
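With the wrapper changes above, the dequant helpers accept either the flat 3-D cache layout or the paged 4-D layout: a 3-D input is unsqueezed to block_size = 1 on entry and viewed back to 3-D on return. A torch-only sketch of that shape round trip (shapes illustrative, uint8 standing in for the fp8 cache):

import torch

flat = torch.zeros(16, 1, 656, dtype=torch.uint8)  # stand-in for the fp8 k-cache
as_4d = flat.unsqueeze(1)                          # [16, 1, 1, 656], i.e. block_size = 1
restored = as_4d.view(16, 1, -1)                   # viewed back to [16, 1, 656] on return
assert restored.shape == flat.shape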
@@ -159,5 +170,126 @@ def _dequantize_k_cache_fast_kernel(
         tl.store(dst_ptr, data, mask=mask)
 
 
+def dequantize_k_cache_paged(
+    quant_k_cache: torch.Tensor,
+    page_table_1_flattened: torch.Tensor,
+    group_size: int = 128,
+) -> torch.Tensor:
+    """
+    De-quantize the k-cache with paged layout
+    Args:
+        quant_k_cache: [total_num_tokens, 1, dim_quant] or [num_blocks, block_size, 1, dim_quant], the quantized k-cache in paged layout
+        page_table_1_flattened: [num_tokens], the flattened page_table_1 with the page indices in each requests concatenated together
+    Returns:
+        output: [num_tokens, 1, dim_nope + dim_rope], the de-quantized k-cache
+    """
+    dim_quant = quant_k_cache.shape[-1]
+    assert (
+        dim_quant == 656
+    ), f"dim_quant: {dim_quant} != 656 detected in dequantize_k_cache_paged"
+    quant_k_cache = quant_k_cache.view((-1, dim_quant))
+
+    total_num_tokens, _ = quant_k_cache.shape
+    num_tokens = page_table_1_flattened.shape[0]
+    assert num_tokens <= total_num_tokens
+
+    assert quant_k_cache.dtype == torch.float8_e4m3fn
+    dim_nope = 512
+    dim_rope = 64
+    num_tiles = dim_nope // group_size  # 512 // 128 = 4
+
+    output = torch.empty(
+        (num_tokens, 1, dim_nope + dim_rope),
+        dtype=torch.bfloat16,
+        device=quant_k_cache.device,
+    )
+
+    # cdiv(512 + 64, 128) = 5
+    num_blocks_per_token = triton.cdiv(dim_nope + dim_rope, group_size)
+    assert num_blocks_per_token == 5
+
+    assert dim_nope % group_size == 0
+
+    input_nope_q = quant_k_cache[:, :dim_nope]
+    # [:, 512:512+4*4] = [:, 512:528]
+    input_nope_s = quant_k_cache[:, dim_nope : dim_nope + num_tiles * 4].view(
+        torch.float32
+    )
+    # [:, 528:]
+    input_rope = quant_k_cache[:, dim_nope + num_tiles * 4 :].view(torch.bfloat16)
+
+    _dequantize_k_cache_paged_kernel[(num_tokens, num_blocks_per_token)](
+        output,
+        input_nope_q,
+        input_nope_s,
+        input_rope,
+        page_table_1_flattened,
+        output.stride(0),
+        input_nope_q.stride(0),
+        input_nope_s.stride(0),
+        input_rope.stride(0),
+        NUM_NOPE_BLOCKS=num_tiles,
+        GROUP_SIZE=group_size,
+        DIM_NOPE=dim_nope,
+        DIM_ROPE=dim_rope,
+    )
+
+    return output
+
+
+@triton.jit
+def _dequantize_k_cache_paged_kernel(
+    output_ptr,
+    input_nope_q_ptr,
+    input_nope_s_ptr,
+    input_rope_ptr,
+    page_table_1_ptr,
+    output_stride_0: int,
+    input_nope_q_stride_0: int,
+    input_nope_s_stride_0: int,
+    input_rope_stride_0: int,
+    NUM_NOPE_BLOCKS: tl.constexpr,
+    GROUP_SIZE: tl.constexpr,
+    DIM_NOPE: tl.constexpr,
+    DIM_ROPE: tl.constexpr,
+):
+    token_id = tl.program_id(0)
+    token_id_paged = tl.load(page_table_1_ptr + token_id).to(tl.int32)
+    raw_block_id = tl.program_id(1)
+
+    if raw_block_id < NUM_NOPE_BLOCKS:
+        # a. dequant nope
+        effective_block_id = raw_block_id
+
+        offs_q = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs_q < DIM_NOPE
+        ptr_q = input_nope_q_ptr + token_id_paged * input_nope_q_stride_0 + offs_q
+        ptr_s = (
+            input_nope_s_ptr
+            + token_id_paged * input_nope_s_stride_0
+            + effective_block_id
+        )
+
+        y_q = tl.load(ptr_q, mask=mask, other=0.0).to(tl.float32)
+        y_s = tl.load(ptr_s)
+
+        y = (y_q * y_s).to(output_ptr.dtype.element_ty)
+
+        dst_ptr = output_ptr + token_id * output_stride_0 + offs_q
+        tl.store(dst_ptr, y, mask=mask)
+    else:
+        # b. copy rope
+        effective_block_id = raw_block_id - NUM_NOPE_BLOCKS
+
+        offs = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs < DIM_ROPE
+
+        src_ptr = input_rope_ptr + token_id_paged * input_rope_stride_0 + offs
+        dst_ptr = output_ptr + token_id * output_stride_0 + DIM_NOPE + offs
+
+        data = tl.load(src_ptr, mask=mask).to(tl.bfloat16)
+        tl.store(dst_ptr, data, mask=mask)
+
+
 if __name__ == "__main__":
     raise Exception("UT is in quant_k_cache.py")
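A usage sketch for the new paged entry point, following the shapes in its docstring; the zero-filled tensors are purely illustrative, and running it requires a CUDA device with Triton and fp8 support:

import torch
from sglang.srt.layers.attention.nsa.dequant_k_cache import dequantize_k_cache_paged

# Zero-initialized stand-in for the quantized k-cache pool; the last dim must be 656
# bytes per token (512 fp8 nope values + 4 fp32 scales + 64 bf16 rope values).
kv_pool = torch.zeros(1024, 1, 656, dtype=torch.uint8, device="cuda").view(torch.float8_e4m3fn)
page_table_1 = torch.arange(32, dtype=torch.int32, device="cuda")  # flattened token slots
k_bf16 = dequantize_k_cache_paged(kv_pool, page_table_1)
assert k_bf16.shape == (32, 1, 576) and k_bf16.dtype == torch.bfloat16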

sglang/srt/layers/attention/nsa/nsa_indexer.py

@@ -119,6 +119,7 @@ class Indexer(CustomOp):
         prefix: str = "",
         quant_config: Optional[QuantizationConfig] = None,
         alt_stream: Optional[torch.cuda.Stream] = None,
+        fuse_wk_and_weights_proj: bool = False,
     ):
         super().__init__()
         self.hidden_size = hidden_size
@@ -129,6 +130,7 @@ class Indexer(CustomOp):
         self.q_lora_rank = q_lora_rank
         self.layer_id = layer_id
         self.alt_stream = alt_stream
+        self.fuse_wk_and_weights_proj = fuse_wk_and_weights_proj
         if is_cuda():
             self.sm_count = deep_gemm.get_num_sms()
             self.half_device_sm_count = align(self.sm_count // 2, 8)
@@ -140,21 +142,29 @@
             quant_config=quant_config,
             prefix=add_prefix("wq_b", prefix),
         )
-        self.wk = ReplicatedLinear(
-            self.hidden_size,
-            self.head_dim,
-            bias=False,
-            quant_config=quant_config,
-            prefix=add_prefix("wk", prefix),
-        )
+        if self.fuse_wk_and_weights_proj:
+            self.fused_wk_and_weights_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.head_dim + self.n_heads,
+                bias=False,
+                prefix=add_prefix("fused_wk_and_weights_proj", prefix),
+            )
+        else:
+            self.wk = ReplicatedLinear(
+                self.hidden_size,
+                self.head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("wk", prefix),
+            )
+            # NOTE: weight_proj is not quantized
+            self.weights_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.n_heads,
+                bias=False,
+                prefix=add_prefix("weights_proj", prefix),
+            )
         self.k_norm = V32LayerNorm(self.head_dim)
-        # NOTE: weight_proj is not quantized
-        self.weights_proj = ReplicatedLinear(
-            self.hidden_size,
-            self.n_heads,
-            bias=False,
-            prefix=add_prefix("weights_proj", prefix),
-        )
         self.rotary_emb = get_rope_wrapper(
             rope_head_dim,
             rotary_dim=rope_head_dim,
@@ -169,8 +179,7 @@
         self.softmax_scale = self.head_dim**-0.5
 
     @torch.compile(dynamic=True)
-    def _get_logits_head_gate(self, x: torch.Tensor, q_scale: torch.Tensor):
-        weights, _ = self.weights_proj(x)
+    def _get_logits_head_gate(self, weights: torch.Tensor, q_scale: torch.Tensor):
         weights = weights * self.n_heads**-0.5
         weights = weights.unsqueeze(-1) * q_scale * self.softmax_scale
         return weights
@@ -182,7 +191,7 @@
         positions: torch.Tensor,
         enable_dual_stream: bool,
     ):
-
+        weights = None
         if enable_dual_stream:
             current_stream = torch.cuda.current_stream()
             self.alt_stream.wait_stream(current_stream)
@@ -199,7 +208,12 @@
             )
             with torch.cuda.stream(self.alt_stream):
                 # TODO we should also put DeepGEMM half SM here?
-                key, _ = self.wk(x)
+                if self.fuse_wk_and_weights_proj:
+                    key, weights = self.fused_wk_and_weights_proj(x)[0].split(
+                        [self.head_dim, self.n_heads], dim=-1
+                    )
+                else:
+                    key, _ = self.wk(x)
                 key = self.k_norm(key)
 
                 k_rope, _ = torch.split(
@@ -217,7 +231,12 @@
                 query, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1
             )
 
-            key, _ = self.wk(x)
+            if self.fuse_wk_and_weights_proj:
+                key, weights = self.fused_wk_and_weights_proj(x)[0].split(
+                    [self.head_dim, self.n_heads], dim=-1
+                )
+            else:
+                key, _ = self.wk(x)
             key = self.k_norm(key)
             k_rope, _ = torch.split(
                 key, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1
@@ -240,7 +259,7 @@
         query = rotate_activation(query)
         key = rotate_activation(key)
 
-        return query, key
+        return query, key, weights
 
     def _get_topk_paged(
         self,
@@ -490,7 +509,9 @@ class Indexer(CustomOp):
         if metadata is None:
             return None
 
-        query, key = self._get_q_k_bf16(q_lora, x, positions, enable_dual_stream)
+        query, key, weights = self._get_q_k_bf16(
+            q_lora, x, positions, enable_dual_stream
+        )
 
         if enable_dual_stream:
             current_stream = torch.cuda.current_stream()
@@ -517,7 +538,9 @@
             index_k_scale=k_scale,
         )
 
-        weights = self._get_logits_head_gate(x, q_scale)
+        if not self.fuse_wk_and_weights_proj:
+            weights, _ = self.weights_proj(x)
+        weights = self._get_logits_head_gate(weights, q_scale)
 
         if is_cuda():
             assert forward_batch.seq_lens_cpu is not None
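For reference, when fuse_wk_and_weights_proj is enabled the indexer runs one GEMM instead of two and recovers the key and gate inputs with the same split used above. A plain-torch sketch of that split; the dimensions below are illustrative only:

import torch

head_dim, n_heads, hidden_size = 128, 64, 1024
x = torch.randn(4, hidden_size)
w_fused = torch.randn(head_dim + n_heads, hidden_size)  # stands in for fused_wk_and_weights_proj

fused_out = x @ w_fused.t()                              # [4, head_dim + n_heads]
key, weights = fused_out.split([head_dim, n_heads], dim=-1)
assert key.shape == (4, head_dim) and weights.shape == (4, n_heads)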