sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +10 -1
  3. sglang/bench_serving.py +251 -26
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/internvl.py +6 -0
  7. sglang/srt/configs/longcat_flash.py +104 -0
  8. sglang/srt/configs/model_config.py +37 -7
  9. sglang/srt/configs/qwen3_next.py +326 -0
  10. sglang/srt/connector/__init__.py +1 -1
  11. sglang/srt/connector/base_connector.py +1 -2
  12. sglang/srt/connector/redis.py +2 -2
  13. sglang/srt/connector/serde/__init__.py +1 -1
  14. sglang/srt/connector/serde/safe_serde.py +4 -3
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/ascend/conn.py +75 -0
  21. sglang/srt/disaggregation/base/conn.py +1 -1
  22. sglang/srt/disaggregation/common/conn.py +15 -12
  23. sglang/srt/disaggregation/decode.py +6 -4
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -420
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +6 -4
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +94 -58
  31. sglang/srt/entrypoints/engine.py +34 -14
  32. sglang/srt/entrypoints/http_server.py +172 -47
  33. sglang/srt/entrypoints/openai/protocol.py +63 -3
  34. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  35. sglang/srt/entrypoints/openai/serving_chat.py +34 -19
  36. sglang/srt/entrypoints/openai/serving_completions.py +10 -4
  37. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  38. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  39. sglang/srt/eplb/eplb_manager.py +28 -4
  40. sglang/srt/eplb/expert_distribution.py +55 -15
  41. sglang/srt/eplb/expert_location.py +8 -3
  42. sglang/srt/eplb/expert_location_updater.py +1 -1
  43. sglang/srt/function_call/ebnf_composer.py +11 -9
  44. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  45. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  46. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  47. sglang/srt/hf_transformers_utils.py +12 -0
  48. sglang/srt/layers/activation.py +44 -9
  49. sglang/srt/layers/attention/aiter_backend.py +93 -68
  50. sglang/srt/layers/attention/ascend_backend.py +250 -112
  51. sglang/srt/layers/attention/fla/chunk.py +242 -0
  52. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  53. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  54. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  55. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  56. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  57. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  58. sglang/srt/layers/attention/fla/index.py +37 -0
  59. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  60. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  61. sglang/srt/layers/attention/fla/op.py +66 -0
  62. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  63. sglang/srt/layers/attention/fla/utils.py +331 -0
  64. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  65. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  66. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  67. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  68. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  69. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  70. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  71. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  72. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  73. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  75. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  76. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  77. sglang/srt/layers/communicator.py +45 -7
  78. sglang/srt/layers/layernorm.py +54 -12
  79. sglang/srt/layers/logits_processor.py +10 -3
  80. sglang/srt/layers/moe/__init__.py +2 -1
  81. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  82. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  83. sglang/srt/layers/moe/ep_moe/layer.py +110 -49
  84. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  85. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  94. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  95. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  96. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  97. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  98. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  99. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  100. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  101. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  102. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  103. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  104. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  105. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  106. sglang/srt/layers/moe/topk.py +43 -12
  107. sglang/srt/layers/moe/utils.py +6 -5
  108. sglang/srt/layers/quantization/awq.py +19 -7
  109. sglang/srt/layers/quantization/base_config.py +11 -6
  110. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  111. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  112. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  115. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  116. sglang/srt/layers/quantization/fp8.py +76 -47
  117. sglang/srt/layers/quantization/fp8_utils.py +43 -29
  118. sglang/srt/layers/quantization/gptq.py +25 -17
  119. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  120. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  121. sglang/srt/layers/quantization/mxfp4.py +77 -45
  122. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  123. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  124. sglang/srt/layers/quantization/quark/utils.py +97 -0
  125. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  126. sglang/srt/layers/quantization/unquant.py +135 -47
  127. sglang/srt/layers/quantization/utils.py +13 -0
  128. sglang/srt/layers/quantization/w4afp8.py +60 -42
  129. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  130. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  131. sglang/srt/layers/rocm_linear_utils.py +44 -0
  132. sglang/srt/layers/rotary_embedding.py +28 -19
  133. sglang/srt/layers/sampler.py +29 -5
  134. sglang/srt/lora/backend/base_backend.py +50 -8
  135. sglang/srt/lora/backend/triton_backend.py +90 -2
  136. sglang/srt/lora/layers.py +32 -0
  137. sglang/srt/lora/lora.py +4 -1
  138. sglang/srt/lora/lora_manager.py +35 -112
  139. sglang/srt/lora/mem_pool.py +24 -10
  140. sglang/srt/lora/utils.py +18 -9
  141. sglang/srt/managers/cache_controller.py +242 -278
  142. sglang/srt/managers/data_parallel_controller.py +30 -15
  143. sglang/srt/managers/detokenizer_manager.py +13 -2
  144. sglang/srt/managers/disagg_service.py +46 -0
  145. sglang/srt/managers/io_struct.py +160 -11
  146. sglang/srt/managers/mm_utils.py +6 -1
  147. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  148. sglang/srt/managers/schedule_batch.py +27 -44
  149. sglang/srt/managers/schedule_policy.py +4 -3
  150. sglang/srt/managers/scheduler.py +90 -115
  151. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  152. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  153. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  154. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  155. sglang/srt/managers/template_manager.py +3 -3
  156. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  157. sglang/srt/managers/tokenizer_manager.py +41 -477
  158. sglang/srt/managers/tp_worker.py +16 -4
  159. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  160. sglang/srt/mem_cache/allocator.py +1 -1
  161. sglang/srt/mem_cache/chunk_cache.py +1 -1
  162. sglang/srt/mem_cache/hicache_storage.py +24 -22
  163. sglang/srt/mem_cache/hiradix_cache.py +184 -101
  164. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  165. sglang/srt/mem_cache/memory_pool.py +324 -41
  166. sglang/srt/mem_cache/memory_pool_host.py +25 -18
  167. sglang/srt/mem_cache/radix_cache.py +5 -6
  168. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  169. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  170. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  171. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  172. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
  173. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  174. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  175. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
  176. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  177. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  178. sglang/srt/metrics/collector.py +484 -63
  179. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  180. sglang/srt/metrics/utils.py +48 -0
  181. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  182. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  183. sglang/srt/model_executor/forward_batch_info.py +72 -18
  184. sglang/srt/model_executor/model_runner.py +189 -31
  185. sglang/srt/model_loader/__init__.py +9 -3
  186. sglang/srt/model_loader/loader.py +33 -28
  187. sglang/srt/model_loader/utils.py +12 -0
  188. sglang/srt/model_loader/weight_utils.py +2 -1
  189. sglang/srt/models/deepseek_v2.py +311 -50
  190. sglang/srt/models/gemma3n_mm.py +1 -1
  191. sglang/srt/models/glm4_moe.py +10 -1
  192. sglang/srt/models/glm4v.py +4 -2
  193. sglang/srt/models/gpt_oss.py +5 -18
  194. sglang/srt/models/internvl.py +28 -0
  195. sglang/srt/models/llama4.py +9 -0
  196. sglang/srt/models/llama_eagle3.py +17 -0
  197. sglang/srt/models/longcat_flash.py +1026 -0
  198. sglang/srt/models/longcat_flash_nextn.py +699 -0
  199. sglang/srt/models/minicpmv.py +165 -3
  200. sglang/srt/models/mllama4.py +25 -0
  201. sglang/srt/models/opt.py +637 -0
  202. sglang/srt/models/qwen2.py +33 -3
  203. sglang/srt/models/qwen2_5_vl.py +90 -42
  204. sglang/srt/models/qwen2_moe.py +79 -14
  205. sglang/srt/models/qwen3.py +8 -2
  206. sglang/srt/models/qwen3_moe.py +39 -8
  207. sglang/srt/models/qwen3_next.py +1039 -0
  208. sglang/srt/models/qwen3_next_mtp.py +109 -0
  209. sglang/srt/models/torch_native_llama.py +1 -1
  210. sglang/srt/models/transformers.py +1 -1
  211. sglang/srt/multimodal/processors/base_processor.py +4 -2
  212. sglang/srt/multimodal/processors/glm4v.py +9 -9
  213. sglang/srt/multimodal/processors/internvl.py +141 -129
  214. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  215. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  216. sglang/srt/sampling/sampling_batch_info.py +18 -15
  217. sglang/srt/server_args.py +297 -79
  218. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  219. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  220. sglang/srt/speculative/eagle_worker.py +216 -120
  221. sglang/srt/speculative/spec_info.py +5 -0
  222. sglang/srt/speculative/standalone_worker.py +109 -0
  223. sglang/srt/utils.py +37 -2
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  226. sglang/test/few_shot_gsm8k.py +1 -0
  227. sglang/test/runners.py +4 -0
  228. sglang/test/test_cutlass_moe.py +24 -6
  229. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  230. sglang/test/test_disaggregation_utils.py +66 -0
  231. sglang/test/test_utils.py +25 -1
  232. sglang/utils.py +5 -0
  233. sglang/version.py +1 -1
  234. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
  235. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
  236. sglang/srt/disaggregation/launch_lb.py +0 -131
  237. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  238. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  239. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  240. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  241. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  242. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  243. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  244. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  245. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
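
The headline addition in this release is the vendored flash-linear-attention (`fla`) kernel suite (items 51-64), which backs the new hybrid linear-attention models such as `qwen3_next` (items 67-68, 207-208). The two hunks below show the chunked gated-delta-rule entry points. As orientation, here is a minimal smoke-test sketch of the public entry point `chunk_gated_delta_rule`; the import path is inferred from the file list above, the shapes follow the function's own docstring, and a CUDA device with Triton is assumed (the code below rejects float32 inputs).

import torch
import torch.nn.functional as F

from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule

# Small, illustrative sizes; K must be <= 256 per the kernel's assertion.
B, T, H, K, V = 2, 256, 4, 128, 128
q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device="cuda")
k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device="cuda"), p=2, dim=-1)
v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device="cuda")
beta = torch.rand(B, T, H, dtype=torch.bfloat16, device="cuda").sigmoid()  # update strengths in (0, 1)
g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device="cuda"))  # forget gates, log space

o, final_state = chunk_gated_delta_rule(q, k, v, g, beta, output_final_state=True)
print(o.shape, final_state.shape)  # [B, T, H, V] and [B, H, K, V]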
--- /dev/null
+++ b/sglang/srt/layers/attention/fla/chunk.py
@@ -0,0 +1,242 @@
+ # Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/chunk.py
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ import warnings
+ from typing import Optional
+
+ import torch
+ from einops import rearrange
+
+ from sglang.srt.layers.attention.fla.chunk_delta_h import chunk_gated_delta_rule_fwd_h
+ from sglang.srt.layers.attention.fla.chunk_o import chunk_fwd_o
+ from sglang.srt.layers.attention.fla.chunk_scaled_dot_kkt import (
+     chunk_scaled_dot_kkt_fwd,
+ )
+ from sglang.srt.layers.attention.fla.cumsum import chunk_local_cumsum
+ from sglang.srt.layers.attention.fla.l2norm import l2norm_fwd
+ from sglang.srt.layers.attention.fla.solve_tril import solve_tril
+ from sglang.srt.layers.attention.fla.utils import (
+     SUPPRESS_LEVEL,
+     autocast_custom_fwd,
+     input_guard,
+ )
+ from sglang.srt.layers.attention.fla.wy_fast import recompute_w_u_fwd
+
+
+ def chunk_gated_delta_rule_fwd(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     g: torch.Tensor,
+     beta: torch.Tensor,
+     scale: float,
+     initial_state: torch.Tensor,
+     output_final_state: bool,
+     cu_seqlens: Optional[torch.LongTensor] = None,
+ ):
+     g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
+     # obtain WY representation. u is actually the new v.
+     A = chunk_scaled_dot_kkt_fwd(
+         k=k, beta=beta, g_cumsum=g, cu_seqlens=cu_seqlens, output_dtype=torch.float32
+     )
+     A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+     w, u = recompute_w_u_fwd(
+         k=k,
+         v=v,
+         beta=beta,
+         A=A,
+         g_cumsum=g,
+         cu_seqlens=cu_seqlens,
+     )
+     h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+         k=k,
+         w=w,
+         u=u,
+         g=g,
+         initial_state=initial_state,
+         output_final_state=output_final_state,
+         cu_seqlens=cu_seqlens,
+     )
+     o = chunk_fwd_o(
+         q=q,
+         k=k,
+         v=v_new,
+         h=h,
+         g=g,
+         scale=scale,
+         cu_seqlens=cu_seqlens,
+     )
+     if SUPPRESS_LEVEL < 3:
+         return g, o, A, final_state, None, None, None
+     elif SUPPRESS_LEVEL >= 3:
+         return g, o, A, final_state, w, h, v_new
+
+
+ class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
+
+     @staticmethod
+     @input_guard
+     @autocast_custom_fwd
+     def forward(
+         ctx,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         g: torch.Tensor,
+         beta: torch.Tensor,
+         scale: float,
+         initial_state: torch.Tensor,
+         output_final_state: bool,
+         cu_seqlens: Optional[torch.LongTensor] = None,
+         use_qk_l2norm_in_kernel: bool = False,
+     ):
+         q_orig = q
+         k_orig = k
+
+         if use_qk_l2norm_in_kernel:
+             q = l2norm_fwd(q)
+             k = l2norm_fwd(k)
+
+         g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd(
+             q=q,
+             k=k,
+             v=v,
+             g=g,
+             beta=beta,
+             scale=scale,
+             initial_state=initial_state,
+             output_final_state=output_final_state,
+             cu_seqlens=cu_seqlens,
+         )
+         return o.to(q.dtype), final_state
+
+
+ @torch.compiler.disable
+ def chunk_gated_delta_rule(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     g: torch.Tensor,
+     beta: torch.Tensor,
+     scale: float = None,
+     initial_state: torch.Tensor = None,
+     output_final_state: bool = False,
+     cu_seqlens: Optional[torch.LongTensor] = None,
+     head_first: bool = False,
+     use_qk_l2norm_in_kernel: bool = False,
+ ):
+     r"""
+     Args:
+         q (torch.Tensor):
+             queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+         k (torch.Tensor):
+             keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+         v (torch.Tensor):
+             values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+         g (torch.Tensor):
+             (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+         beta (torch.Tensor):
+             betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+         scale (Optional[int]):
+             Scale factor for the RetNet attention scores.
+             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+         initial_state (Optional[torch.Tensor]):
+             Initial state of shape `[N, H, K, V]` for `N` input sequences.
+             For equal-length input sequences, `N` equals the batch size `B`.
+             Default: `None`.
+         output_final_state (Optional[bool]):
+             Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+         cu_seqlens (torch.LongTensor):
+             Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+             consistent with the FlashAttention API.
+         head_first (Optional[bool]):
+             Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+             Default: `False`.
+
+     Returns:
+         o (torch.Tensor):
+             Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+         final_state (torch.Tensor):
+             Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+
+     Examples::
+         >>> import torch
+         >>> import torch.nn.functional as F
+         >>> from einops import rearrange
+         >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+         # inputs with equal lengths
+         >>> B, T, H, K, V = 4, 2048, 4, 512, 512
+         >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
+         >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1)
+         >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
+         >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
+         >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
+         >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
+         >>> o, ht = chunk_gated_delta_rule(
+             q, k, v, g, beta,
+             initial_state=h0,
+             output_final_state=True
+         )
+         # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+         >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
+         # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+         >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+         >>> o_var, ht_var = chunk_gated_delta_rule(
+             q, k, v, g, beta,
+             initial_state=h0,
+             output_final_state=True,
+             cu_seqlens=cu_seqlens
+         )
+     """
+     assert q.dtype == k.dtype == v.dtype
+     assert (
+         q.dtype != torch.float32
+     ), "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+     assert (
+         len(beta.shape) == 3
+     ), "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+
+     if head_first:
+         raise DeprecationWarning(
+             "head_first is deprecated and will be removed in a future version. "
+             "Please use head_first=False for now instead."
+         )
+         q, k, v, beta, g = map(
+             lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g)
+         )
+     # if not head_first and q.shape[1] < q.shape[2]:
+     #     warnings.warn(
+     #         f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
+     #         "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+     #         "when head_first=False was specified. "
+     #         "Please verify your input tensor format matches the expected shape [B, T, H, ...]."
+     #     )
+     if cu_seqlens is not None:
+         if q.shape[0] != 1:
+             raise ValueError(
+                 f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                 f"Please flatten variable-length inputs before processing."
+             )
+         if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+             raise ValueError(
+                 f"The number of initial states is expected to be equal to the number of input sequences, "
+                 f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+             )
+     if scale is None:
+         scale = k.shape[-1] ** -0.5
+     o, final_state = ChunkGatedDeltaRuleFunction.apply(
+         q,
+         k,
+         v,
+         g,
+         beta,
+         scale,
+         initial_state,
+         output_final_state,
+         cu_seqlens,
+         use_qk_l2norm_in_kernel,
+     )
+     if head_first:
+         o = rearrange(o, "b t h ... -> b h t ...")
+     return o, final_state
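
The forward pass above is the chunk-parallel form of a simple sequential recurrence: at each timestep the state decays by exp(g_t), a delta-rule correction beta_t * (v_t - S^T k_t) is written back along k_t, and the output is read out with q_t. A naive O(T) reference sketch of that recurrence, for intuition only; it is not part of this diff, and the chunked kernels are expected to match it only up to numerical error:

import torch

def gated_delta_rule_ref(q, k, v, g, beta, scale=None, initial_state=None):
    # q, k: [B, T, H, K]; v: [B, T, H, V]; g, beta: [B, T, H]; state S: [B, H, K, V]
    B, T, H, K = q.shape
    V = v.shape[-1]
    scale = K ** -0.5 if scale is None else scale
    S = (initial_state.float().clone() if initial_state is not None
         else q.new_zeros(B, H, K, V, dtype=torch.float32))
    q, k, v = q.float(), k.float(), v.float()
    o = torch.empty(B, T, H, V, dtype=torch.float32, device=q.device)
    for t in range(T):
        S = S * g[:, t].float().exp()[..., None, None]        # per-head forget gate
        err = torch.einsum("bhk,bhkv->bhv", k[:, t], S)       # current prediction S^T k_t
        u = beta[:, t].float()[..., None] * (v[:, t] - err)   # delta-rule correction
        S = S + torch.einsum("bhk,bhv->bhkv", k[:, t], u)     # rank-1 state update
        o[:, t] = torch.einsum("bhk,bhkv->bhv", q[:, t] * scale, S)
    return o, S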
--- /dev/null
+++ b/sglang/srt/layers/attention/fla/chunk_delta_h.py
@@ -0,0 +1,314 @@
+ # Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_delta_h.py
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from typing import Optional, Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from sglang.srt.layers.attention.fla.index import (
+     prepare_chunk_indices,
+     prepare_chunk_offsets,
+ )
+ from sglang.srt.layers.attention.fla.op import exp, safe_exp
+ from sglang.srt.layers.attention.fla.utils import is_nvidia_hopper
+
+ NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+ @triton.heuristics(
+     {
+         "USE_G": lambda args: args["g"] is not None,
+         "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+         "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+         "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+         "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+     }
+ )
+ # @triton.autotune(
+ #     configs=[
+ #         triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages)
+ #         for num_warps in [2, 4]
+ #         for num_stages in [2, 3, 4]
+ #         for BV in [32, 64]
+ #     ],
+ #     key=["H", "K", "V", "BT", "USE_G"],
+ #     use_cuda_graph=use_cuda_graph,
+ # )
+ @triton.jit(do_not_specialize=["T"])
+ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+     k,
+     v,
+     w,
+     v_new,
+     g,
+     h,
+     h0,
+     ht,
+     cu_seqlens,
+     chunk_offsets,
+     T,
+     H: tl.constexpr,
+     Hg: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BV: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_INITIAL_STATE: tl.constexpr,
+     STORE_FINAL_STATE: tl.constexpr,
+     SAVE_NEW_VALUE: tl.constexpr,
+     IS_VARLEN: tl.constexpr,
+ ):
+     i_v, i_nh = tl.program_id(0), tl.program_id(1)
+     i_n, i_h = i_nh // H, i_nh % H
+     if IS_VARLEN:
+         bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+             cu_seqlens + i_n + 1
+         ).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+     else:
+         bos, eos = i_n * T, i_n * T + T
+         NT = tl.cdiv(T, BT)
+         boh = i_n * NT
+
+     # [BK, BV]
+     b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+     if K > 64:
+         b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+     if K > 128:
+         b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+     if K > 192:
+         b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+     # calculate offset
+     h += (boh * H + i_h) * K * V
+     v += (bos * H + i_h) * V
+     k += (bos * Hg + i_h // (H // Hg)) * K
+     w += (bos * H + i_h) * K
+     if SAVE_NEW_VALUE:
+         v_new += (bos * H + i_h) * V
+     stride_v = H * V
+     stride_h = H * K * V
+     stride_k = Hg * K
+     stride_w = H * K
+     if USE_INITIAL_STATE:
+         h0 = h0 + i_nh * K * V
+     if STORE_FINAL_STATE:
+         ht = ht + i_nh * K * V
+
+     # load initial state
+     if USE_INITIAL_STATE:
+         p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+         b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+         if K > 64:
+             p_h0_2 = tl.make_block_ptr(
+                 h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+             )
+             b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+         if K > 128:
+             p_h0_3 = tl.make_block_ptr(
+                 h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+             )
+             b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+         if K > 192:
+             p_h0_4 = tl.make_block_ptr(
+                 h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+             )
+             b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+     # main recurrence
+     for i_t in range(NT):
+         p_h1 = tl.make_block_ptr(
+             h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)
+         )
+         tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+         if K > 64:
+             p_h2 = tl.make_block_ptr(
+                 h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+         if K > 128:
+             p_h3 = tl.make_block_ptr(
+                 h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+         if K > 192:
+             p_h4 = tl.make_block_ptr(
+                 h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+         p_v = tl.make_block_ptr(
+             v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+         )
+         p_v_new = (
+             tl.make_block_ptr(
+                 v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+             )
+             if SAVE_NEW_VALUE
+             else None
+         )
+         b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
+         p_w = tl.make_block_ptr(
+             w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0)
+         )
+         b_w = tl.load(p_w, boundary_check=(0, 1))
+         b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+         if K > 64:
+             p_w = tl.make_block_ptr(
+                 w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0)
+             )
+             b_w = tl.load(p_w, boundary_check=(0, 1))
+             b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+         if K > 128:
+             p_w = tl.make_block_ptr(
+                 w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0)
+             )
+             b_w = tl.load(p_w, boundary_check=(0, 1))
+             b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+         if K > 192:
+             p_w = tl.make_block_ptr(
+                 w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0)
+             )
+             b_w = tl.load(p_w, boundary_check=(0, 1))
+             b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
+         b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1))
+
+         if SAVE_NEW_VALUE:
+             p_v_new = tl.make_block_ptr(
+                 v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+             )
+             tl.store(
+                 p_v_new, b_v_new.to(p_v_new.dtype.element_ty), boundary_check=(0, 1)
+             )
+
+         if USE_G:
+             last_idx = min((i_t + 1) * BT, T) - 1
+             b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+             p_g = tl.make_block_ptr(
+                 g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+             )
+             b_g = tl.load(p_g, boundary_check=(0,))
+             b_v_new = b_v_new * safe_exp(b_g_last - b_g)[:, None]
+             b_g_last = exp(b_g_last)
+             b_h1 = b_h1 * b_g_last
+             if K > 64:
+                 b_h2 = b_h2 * b_g_last
+             if K > 128:
+                 b_h3 = b_h3 * b_g_last
+             if K > 192:
+                 b_h4 = b_h4 * b_g_last
+         b_v_new = b_v_new.to(k.dtype.element_ty)
+         p_k = tl.make_block_ptr(
+             k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)
+         )
+         b_k = tl.load(p_k, boundary_check=(0, 1))
+         b_h1 += tl.dot(b_k, b_v_new)
+         if K > 64:
+             p_k = tl.make_block_ptr(
+                 k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)
+             )
+             b_k = tl.load(p_k, boundary_check=(0, 1))
+             b_h2 += tl.dot(b_k, b_v_new)
+         if K > 128:
+             p_k = tl.make_block_ptr(
+                 k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)
+             )
+             b_k = tl.load(p_k, boundary_check=(0, 1))
+             b_h3 += tl.dot(b_k, b_v_new)
+         if K > 192:
+             p_k = tl.make_block_ptr(
+                 k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)
+             )
+             b_k = tl.load(p_k, boundary_check=(0, 1))
+             b_h4 += tl.dot(b_k, b_v_new)
+
+     # epilogue
+     if STORE_FINAL_STATE:
+         p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+         tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+         if K > 64:
+             p_ht = tl.make_block_ptr(
+                 ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+         if K > 128:
+             p_ht = tl.make_block_ptr(
+                 ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+         if K > 192:
+             p_ht = tl.make_block_ptr(
+                 ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+             )
+             tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+ def chunk_gated_delta_rule_fwd_h(
+     k: torch.Tensor,
+     w: torch.Tensor,
+     u: torch.Tensor,
+     g: Optional[torch.Tensor] = None,
+     initial_state: Optional[torch.Tensor] = None,
+     output_final_state: bool = False,
+     chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
+     save_new_value: bool = True,
+     cu_seqlens: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     B, T, Hg, K, V = *k.shape, u.shape[-1]
+     H = u.shape[-2]
+     BT = chunk_size
+
+     chunk_indices = (
+         prepare_chunk_indices(cu_seqlens, chunk_size)
+         if cu_seqlens is not None
+         else None
+     )
+     # N: the actual number of sequences in the batch with either equal or variable lengths
+     if cu_seqlens is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         N, NT, chunk_offsets = (
+             len(cu_seqlens) - 1,
+             len(chunk_indices),
+             prepare_chunk_offsets(cu_seqlens, BT),
+         )
+     assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+     h = k.new_empty(B, NT, H, K, V)
+     final_state = (
+         k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+     )
+
+     v_new = torch.empty_like(u) if save_new_value else None
+
+     def grid(meta):
+         return (triton.cdiv(V, meta["BV"]), N * H)
+
+     chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+         k=k,
+         v=u,
+         w=w,
+         v_new=v_new,
+         g=g,
+         h=h,
+         h0=initial_state,
+         ht=final_state,
+         cu_seqlens=cu_seqlens,
+         chunk_offsets=chunk_offsets,
+         T=T,
+         H=H,
+         Hg=Hg,
+         K=K,
+         V=V,
+         BT=BT,
+         BV=32,
+         num_warps=4,
+         num_stages=2,
+     )
+     return h, v_new, final_state
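
Both files share the same variable-length convention: sequences are packed into a single batch row, `cu_seqlens` holds `[N+1]` cumulative token boundaries, and `chunk_offsets` tells each program where sequence `i_n`'s 64-token chunk states begin in `h` (`boh = tl.load(chunk_offsets + i_n)`). A small sketch of that bookkeeping; `prepare_chunk_offsets` itself lives in the new `fla/index.py` (item 58), and the arithmetic below is a reading of what it computes, not its verbatim implementation:

import torch
import torch.nn.functional as F

lens = torch.tensor([100, 64, 200])                    # three packed sequences
cu_seqlens = F.pad(lens.cumsum(0), (1, 0))             # tensor([  0, 100, 164, 364])
BT = 64                                                # chunk size used by the kernels
n_chunks = (lens + BT - 1) // BT                       # tensor([2, 1, 4]) chunks per sequence
chunk_offsets = F.pad(n_chunks.cumsum(0), (1, 0))[:-1] # tensor([0, 2, 3]) start of each
                                                       # sequence's chunk states in h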