PyPI - rwkv-ops - Versions diffs - 0.1.0__py3-none-any.whl - Mend

rwkv-ops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rwkv-ops might be problematic. Click here for more details.

Files changed (43) hide show

rwkv_ops/__init__.py +26 -0
rwkv_ops/rwkv7_kernel/__init__.py +153 -0
rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +221 -0
rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
rwkv_ops/rwkv7_kernel/native_keras_op.py +95 -0
rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
rwkv_ops/rwkv7_kernel/torch_op.py +523 -0
rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
rwkv_ops-0.1.0.dist-info/LICENSE.txt +201 -0
rwkv_ops-0.1.0.dist-info/METADATA +118 -0
rwkv_ops-0.1.0.dist-info/RECORD +43 -0
rwkv_ops-0.1.0.dist-info/WHEEL +5 -0
rwkv_ops-0.1.0.dist-info/top_level.txt +1 -0

rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py ADDED Viewed

@@ -0,0 +1,86 @@
+import triton
+import triton.language as tl
+from ..triton_kernel.utils import use_cuda_graph
+@triton.autotune(  # candidate configs: every BS x num_warps x num_stages combination below
+    configs=[
+        triton.Config({"BS": BS}, num_warps=num_warps, num_stages=num_stages)
+        for BS in [16, 32, 64]
+        for num_warps in [4, 8, 16]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["S", "BT"],  # arguments used as the autotune key
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])  # T is excluded from argument specialization
+def chunk_rwkv6_fwd_cumsum_kernel(  # chunk-local inclusive (oi) and exclusive (oe) cumulative sums of s over time
+    s,  # input pointer, read as (T, S) tiles per (batch, head)
+    T,  # sequence length (runtime value)
+    oi,  # output pointer: inclusive cumulative sum within each BT-row chunk
+    oe,  # output pointer: exclusive cumulative sum within each BT-row chunk
+    H: tl.constexpr,  # number of heads (used to split program_id(2) and in strides)
+    S: tl.constexpr,  # size of the last (feature) axis
+    BT: tl.constexpr,  # rows (time steps) per tile
+    BS: tl.constexpr,  # columns per tile, chosen by the autotuner
+):
+    cu_seqlens = None  # variable-length support is disabled here; only referenced in the dead branch below
+    chunk_indices = None  # same as above
+    i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)  # column-block, time-chunk, fused batch*head index
+    i_b, i_h = i_bh // H, i_bh % H  # split the fused index into batch and head
+    if False:  # dead branch: variable-length path, never taken
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T  # fixed-length layout: batch i_b starts at row i_b * T (eos is not used below)
+    o_i = tl.arange(0, BT)
+    m_i = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0).to(tl.float32)  # lower-triangular ones incl. diagonal -> inclusive sum
+    m_e = tl.where(o_i[:, None] > o_i[None, :], 1.0, 0.0).to(tl.float32)  # strictly lower-triangular ones -> exclusive sum
+    p_s = tl.make_block_ptr(  # (BT, BS) tile of s at row i_t * BT, column i_s * BS
+        s + (bos * H + i_h) * S,
+        (T, S),
+        (H * S, 1),
+        (i_t * BT, i_s * BS),
+        (BT, BS),
+        (1, 0),
+    )
+    p_oi = tl.make_block_ptr(  # matching tile of oi
+        oi + (bos * H + i_h) * S,
+        (T, S),
+        (H * S, 1),
+        (i_t * BT, i_s * BS),
+        (BT, BS),
+        (1, 0),
+    )
+    p_oe = tl.make_block_ptr(  # matching tile of oe
+        oe + (bos * H + i_h) * S,
+        (T, S),
+        (H * S, 1),
+        (i_t * BT, i_s * BS),
+        (BT, BS),
+        (1, 0),
+    )
+    # [BT, BS]
+    b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)  # compute in float32
+    b_oi = tl.dot(m_i, b_s)  # row i = sum of rows 0..i of the tile
+    b_oe = tl.dot(m_e, b_s)  # row i = sum of rows 0..i-1 of the tile
+    tl.store(
+        p_oi,
+        b_oi.to(p_oi.dtype.element_ty, fp_downcast_rounding="rtne"),  # cast back to the output element type
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_oe,
+        b_oe.to(p_oe.dtype.element_ty, fp_downcast_rounding="rtne"),  # cast back to the output element type
+        boundary_check=(0, 1),
+    )

rwkv_ops/rwkv7_kernel/triton_kernel/utils.py ADDED Viewed

@@ -0,0 +1,20 @@
+import triton
+import triton.language as tl
+is_gather_supported = hasattr(triton.language, "gather")  # True when the installed Triton exposes tl.gather
+if not is_gather_supported:
+    @triton.jit
+    def gather(src, index, axis, _builder=None):  # placeholder with the same call shape as tl.gather
+        # This is a fallback implementation when tl.gather is not supported
+        # In order to pass triton compiler, there is no actual gather operation
+        return src  # NOTE: returns src unchanged; callers must not rely on it for real gather results
+else:
+    gather = tl.gather  # use the native implementation
+exp = tl.exp  # module-level alias for tl.exp
+import keras  # used only to select the device-info module by active Keras backend
+if keras.backend.backend() == "jax":
+    from ..get_jax_devices_info import *  # NOTE(review): `use_cuda_graph`, imported from this module by the kernels, is presumably provided by this star import - confirm
+else:
+    from ..get_torch_devices_info import *  # any non-"jax" backend falls back to the torch device info

rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py ADDED Viewed

@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+import triton
+import triton.language as tl
+from ..triton_kernel.utils import use_cuda_graph
+triton_config = {}  # empty meta-parameter dict; the same object is passed to every Config below
+@triton.autotune(  # candidate configs: every num_warps x num_stages combination below
+    configs=[
+        triton.Config(triton_config, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BT", "BK", "BV"],  # arguments used as the autotune key
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])  # T is excluded from argument specialization
+def prepare_wy_repr_bwd_kernel(  # backward pass: writes dAak, dAab, dv and dag for one (time-chunk, batch*head) tile
+    A_ab_inv,  # input pointer, read as a transposed (BT, BT) tile
+    A_ak,  # input pointer, read as a transposed (BT, BT) tile
+    ag,  # input pointer, (T, K) per (batch, head)
+    v,  # input pointer, (T, V) per (batch, head)
+    dw,  # input gradient pointer, (T, K) per (batch, head)
+    du,  # input gradient pointer, (T, V) per (batch, head)
+    dv0,  # input gradient pointer added into dv, (T, V) per (batch, head)
+    T,  # sequence length (runtime value)
+    dAak,  # output pointer, (T, BT) per (batch, head)
+    dAab,  # output pointer, (T, BT) per (batch, head)
+    dv,  # output pointer, (T, V) per (batch, head)
+    dag,  # output pointer, (T, K) per (batch, head)
+    H: tl.constexpr,  # number of heads
+    K: tl.constexpr,  # size of the K axis
+    V: tl.constexpr,  # size of the V axis
+    BT: tl.constexpr,  # time-chunk size
+    BK: tl.constexpr,  # block size along K
+    BV: tl.constexpr,  # block size along V
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # time-chunk index, fused batch*head index
+    i_b, i_h = i_bh // H, i_bh % H  # split the fused index into batch and head
+    if False:  # dead branch: variable-length path, never taken (chunk_indices / cu_seqlens are not defined in this kernel)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T  # fixed-length layout: batch i_b starts at row i_b * T (eos is not used below)
+    p_Aak_t = tl.make_block_ptr(  # transposed view of A_ak: shape (BT, T), strides (1, H * BT)
+        A_ak + (bos * H + i_h) * BT,
+        (BT, T),
+        (1, H * BT),
+        (0, i_t * BT),
+        (BT, BT),
+        (0, 1),
+    )
+    p_Aab_inv_t = tl.make_block_ptr(  # transposed view of A_ab_inv, same layout as above
+        A_ab_inv + (bos * H + i_h) * BT,
+        (BT, T),
+        (1, H * BT),
+        (0, i_t * BT),
+        (BT, BT),
+        (0, 1),
+    )
+    p_dAak = tl.make_block_ptr(  # non-transposed (T, BT) output tile
+        dAak + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    p_dAab = tl.make_block_ptr(  # non-transposed (T, BT) output tile
+        dAab + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    b_A_ab_inv_t = tl.load(p_Aab_inv_t, boundary_check=(0, 1))
+    b_A_ak_t = tl.load(p_Aak_t, boundary_check=(0, 1))
+    b_A_ak_t = tl.where(  # keep only the strictly upper triangle of the transposed tile
+        tl.arange(0, BT)[:, None] < tl.arange(0, BT)[None, :], b_A_ak_t, 0
+    )
+    b_A_ab_inv_t = tl.where(  # keep the upper triangle including the diagonal
+        tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], b_A_ab_inv_t, 0
+    )
+    b_A_tmp_t = tl.dot(b_A_ak_t, b_A_ab_inv_t).to(v.dtype.element_ty)  # A_ak^T @ A_ab_inv^T, cast to v's element type
+    b_dA_tmp = tl.zeros([BT, BT], dtype=tl.float32)  # float32 accumulator over the V blocks
+    for i_v in range(tl.cdiv(V, BV)):  # iterate over V in blocks of BV
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_dv = tl.make_block_ptr(
+            dv + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_dv0 = tl.make_block_ptr(
+            dv0 + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_du = tl.make_block_ptr(
+            du + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_du = tl.load(p_du, boundary_check=(0, 1))
+        b_dA_tmp += tl.dot(b_du.to(b_v.dtype), tl.trans(b_v))  # accumulate du @ v^T
+        b_dv0 = tl.load(p_dv0, boundary_check=(0, 1))
+        b_dv = b_dv0 + tl.dot(b_A_tmp_t, b_du)  # dv = dv0 + (A_ak^T @ A_ab_inv^T) @ du
+        tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
+    m_i = tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :]  # strictly lower-triangular mask
+    b_dA_tmp = tl.where(m_i, b_dA_tmp, 0)
+    b_dA_ak = tl.dot(b_A_ab_inv_t, b_dA_tmp)  # A_ab_inv^T @ dA_tmp
+    b_dA_ak = tl.where(m_i, b_dA_ak, 0)  # only strictly lower entries are written out
+    tl.store(p_dAak, b_dA_ak, boundary_check=(0, 1))
+    b_dA_ab_inv = tl.dot(b_dA_tmp, b_A_ak_t)  # first contribution to the gradient w.r.t. A_ab_inv
+    for i_k in range(tl.cdiv(K, BK)):  # iterate over K in blocks of BK
+        p_ag = tl.make_block_ptr(
+            ag + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_dag = tl.make_block_ptr(
+            dag + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_dw = tl.make_block_ptr(
+            dw + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_ag = tl.load(p_ag, boundary_check=(0, 1))
+        b_dw = tl.load(p_dw, boundary_check=(0, 1))
+        b_dA_ab_inv += tl.dot(b_dw, tl.trans(b_ag))  # accumulate dw @ ag^T
+        b_dag = tl.dot(b_A_ab_inv_t.to(b_dw.dtype), b_dw)  # dag = A_ab_inv^T @ dw
+        tl.store(p_dag, b_dag.to(p_dag.dtype.element_ty), boundary_check=(0, 1))
+    # if we know dL/dA^(-1), for dL/dA, we can use the following formula:
+    # dL/dA = -(A^(-1))^T @ (dL/dA^(-1)) @ (A^(-1))^T
+    # in the fwd pass we use fwd substitution to calculate (I-lower(A_ab))^-1.
+    # denote A = I - lower(A_ab), B = A^-1
+    # in the backward pass.
+    # dL/dA = -(B)^T @ (dL/dB) @ B^T
+    # dL/dA_ab = lower(B^T @ dL/dB @ B^T)
+    b_dA_ab_inv = tl.where(  # keep the lower triangle including the diagonal before the two products
+        tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :], b_dA_ab_inv, 0
+    )
+    b_dA_ab_inv = tl.dot(b_A_ab_inv_t, b_dA_ab_inv)  # left-multiply by A_ab_inv^T
+    b_dA_ab_inv = tl.dot(b_dA_ab_inv, b_A_ab_inv_t)  # right-multiply by A_ab_inv^T
+    b_dA_ab_inv = tl.where(m_i, b_dA_ab_inv, 0)  # only strictly lower entries are written out
+    tl.store(p_dAab, b_dA_ab_inv, boundary_check=(0, 1))

rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py ADDED Viewed

@@ -0,0 +1,326 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+import triton
+import triton.language as tl
+from ..triton_kernel.utils import is_gather_supported, use_cuda_graph, gather
+@triton.autotune(  # candidate configs differ only in num_warps
+    configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16]],
+    key=["BT"],  # argument used as the autotune key
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])  # T is excluded from argument specialization
+def prepare_wy_repr_fwd_kernel_chunk32(  # row-by-row substitution on one (BT, BT) tile of A_ab, result (plus identity) stored to A_ab_inv
+    A_ab,  # input pointer, (T, BT) per (batch, head)
+    T,  # sequence length (runtime value)
+    A_ab_inv,  # output pointer, same layout as A_ab
+    H: tl.constexpr,  # number of heads
+    BT: tl.constexpr,  # time-chunk size; also the tile side length
+    BC: tl.constexpr,  # placeholder, do not delete
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # time-chunk index, fused batch*head index
+    i_b, i_h = i_bh // H, i_bh % H  # split the fused index into batch and head
+    if False:  # dead branch: variable-length path, never taken (chunk_indices / cu_seqlens are not defined in this kernel)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T  # fixed-length layout: batch i_b starts at row i_b * T (eos is not used below)
+    p_Aab = tl.make_block_ptr(  # (BT, BT) input tile starting at row i_t * BT
+        A_ab + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    p_Aab_inv = tl.make_block_ptr(  # matching output tile
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    b_A_ab = tl.load(p_Aab, boundary_check=(0, 1))
+    b_A_ab = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_A_ab, 0)  # keep the strictly lower triangle
+    for i in range(1, BT):  # update rows 1..BT-1 in order; each step reads the tile as updated so far
+        mask = tl.arange(0, BT) == i  # one-hot selector for row i
+        b_a = tl.sum(tl.where(mask[:, None], b_A_ab, 0), 0)  # extract row i as a length-BT vector
+        b_a = b_a + tl.sum(b_a[:, None] * b_A_ab, 0) * (tl.arange(0, BT) < i)  # add row_i @ tile, restricted to columns < i
+        b_A_ab = tl.where(mask[:, None], b_a, b_A_ab)  # write the updated row back into the tile
+    b_A_ab += tl.arange(0, BT)[:, None] == tl.arange(0, BT)[None, :]  # add the identity on the diagonal
+    tl.store(p_Aab_inv, b_A_ab.to(p_Aab_inv.dtype.element_ty), boundary_check=(0, 1))
+@triton.autotune(  # candidate configs: every num_warps x num_stages combination below
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BC"],  # argument used as the autotune key
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])  # T is excluded from argument specialization
+def prepare_wy_repr_fwd_kernel_chunk64(  # same substitution as the chunk32 kernel, done on two BC x BC diagonal blocks plus one off-diagonal product
+    A_ab,  # input pointer, (T, BT) per (batch, head)
+    T,  # sequence length (runtime value)
+    A_ab_inv,  # output pointer, same layout as A_ab
+    H: tl.constexpr,  # number of heads
+    BT: tl.constexpr,  # time-chunk size
+    BC: tl.constexpr,  # sub-block side length; block offsets below use BC as the half-way point of the tile
+    GATHER_SUPPORTED: tl.constexpr = is_gather_supported,  # selects the gather-based or mask-based row extraction
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # time-chunk index, fused batch*head index
+    i_b, i_h = i_bh // H, i_bh % H  # split the fused index into batch and head
+    if False:  # dead branch: variable-length path, never taken (chunk_indices / cu_seqlens are not defined in this kernel)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T  # fixed-length layout: batch i_b starts at row i_b * T (eos is not used below)
+    p_A1 = tl.make_block_ptr(  # top-left block: rows from i_t * BT, columns from 0
+        A_ab + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A2 = tl.make_block_ptr(  # bottom-right block: rows from i_t * BT + BC, columns from BC
+        A_ab + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT + BC, BC),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A3 = tl.make_block_ptr(  # bottom-left block: rows from i_t * BT + BC, columns from 0
+        A_ab + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT + BC, 0),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A_inv1 = tl.make_block_ptr(  # output, top-left block
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A_inv2 = tl.make_block_ptr(  # output, bottom-right block
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT + BC, BC),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A_inv3 = tl.make_block_ptr(  # output, bottom-left block
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT + BC, 0),
+        (BC, BC),
+        (1, 0),
+    )
+    p_A_inv4 = tl.make_block_ptr(  # output, top-right block (written as zeros at the end)
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, BC),
+        (BC, BC),
+        (1, 0),
+    )
+    b_A = tl.load(p_A1, boundary_check=(0, 1))
+    b_A2 = tl.load(p_A2, boundary_check=(0, 1))
+    b_A3 = tl.load(p_A3, boundary_check=(0, 1))
+    b_A = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A, 0)  # strictly lower triangle of the top-left block
+    b_A2 = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A2, 0)  # strictly lower triangle of the bottom-right block
+    for i in range(1, BC):  # update rows 1..BC-1 of both diagonal blocks in lock-step
+        if GATHER_SUPPORTED:  # compile-time switch; the fallback `gather` from utils is never reached when this is False
+            row_idx = tl.full([1, BC], i, dtype=tl.int16)
+            # [1, BC] -> [BC]
+            b_a = tl.sum(gather(b_A, row_idx, axis=0), 0)
+            b_a2 = tl.sum(gather(b_A2, row_idx, axis=0), 0)
+        else:
+            mask = tl.arange(0, BC) == i  # one-hot selector for row i
+            b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)  # extract row i via masking
+            b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
+        mask = tl.arange(0, BC) == i  # needed by the write-back below on both paths
+        # b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
+        # b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
+        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BC) < i)  # add row_i @ block, restricted to columns < i
+        b_a2 = b_a2 + tl.sum(b_a2[:, None] * b_A2, 0) * (tl.arange(0, BC) < i)
+        b_A = tl.where(mask[:, None], b_a, b_A)  # write the updated row back
+        b_A2 = tl.where(mask[:, None], b_a2, b_A2)
+    # blockwise computation of lower triangular matrix's inverse
+    # i.e., [A11, 0; A21, A22]^-1 = [A11^-1, 0; -A22^-1 A21 A11^-1, A22^-1]
+    b_A += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]  # add the identity on the diagonal
+    b_A2 += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
+    b_A3 = tl.dot(tl.dot(b_A2, b_A3), b_A)  # off-diagonal block: (bottom-right result) @ A3 @ (top-left result); no sign flip is applied here
+    # tl.debug_barrier()
+    tl.store(
+        p_A_inv1,
+        b_A.to(p_A_inv1.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_A_inv2,
+        b_A2.to(p_A_inv2.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_A_inv3,
+        b_A3.to(p_A_inv3.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    # causal mask
+    tl.store(
+        p_A_inv4,
+        tl.zeros([BC, BC], dtype=tl.float32).to(p_A_inv4.dtype.element_ty),  # top-right block is explicitly zeroed
+        boundary_check=(0, 1),
+    )
+@triton.autotune(  # candidate configs: every num_warps x num_stages combination below
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["H", "K", "V", "BT", "BK", "BV"],  # arguments used as the autotune key
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])  # T is excluded from argument specialization
+def wu_fwd_kernel(  # computes w = A_ab_inv @ ag and u = (A_ab_inv @ A_ak) @ v for one (time-chunk, batch*head) tile
+    ag,  # input pointer, (T, K) per (batch, head)
+    v,  # input pointer, (T, V) per (batch, head)
+    A_ab_inv,  # input pointer, (T, BT) per (batch, head)
+    A_ak,  # input pointer, (T, BT) per (batch, head)
+    T,  # sequence length (runtime value)
+    w,  # output pointer, same layout as ag
+    u,  # output pointer, same layout as v
+    H: tl.constexpr,  # number of heads
+    K: tl.constexpr,  # size of the K axis
+    V: tl.constexpr,  # size of the V axis
+    BT: tl.constexpr,  # time-chunk size
+    BK: tl.constexpr,  # block size along K
+    BV: tl.constexpr,  # block size along V
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)  # time-chunk index, fused batch*head index
+    i_b, i_h = i_bh // H, i_bh % H  # split the fused index into batch and head
+    if False:  # dead branch: variable-length path, never taken (chunk_indices / cu_seqlens are not defined in this kernel)
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T  # fixed-length layout: batch i_b starts at row i_b * T (eos is not used below)
+    o_s = tl.arange(0, BT)
+    p_A_ab_inv = tl.make_block_ptr(  # (BT, BT) tile of A_ab_inv starting at row i_t * BT
+        A_ab_inv + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    p_A_ak = tl.make_block_ptr(  # (BT, BT) tile of A_ak starting at row i_t * BT
+        A_ak + (bos * H + i_h) * BT,
+        (T, BT),
+        (H * BT, 1),
+        (i_t * BT, 0),
+        (BT, BT),
+        (1, 0),
+    )
+    b_Aab_inv = tl.load(p_A_ab_inv, boundary_check=(0, 1))
+    b_Aak = tl.load(p_A_ak, boundary_check=(0, 1))
+    b_Aab_inv = tl.where(o_s[:, None] >= o_s[None, :], b_Aab_inv, 0)  # keep the lower triangle including the diagonal
+    b_Aak = tl.where(o_s[:, None] > o_s[None, :], b_Aak, 0)  # keep the strictly lower triangle
+    # let's use tf32 here
+    b_Aak = tl.dot(b_Aab_inv, b_Aak)  # from here on b_Aak holds A_ab_inv @ A_ak
+    # (SY 01/04) should be bf16 or tf32? To verify.
+    b_Aak = b_Aak.to(v.dtype.element_ty, fp_downcast_rounding="rtne")  # cast to v's element type for the dot with v
+    b_Aab_inv = b_Aab_inv.to(ag.dtype.element_ty, fp_downcast_rounding="rtne")  # cast to ag's element type for the dot with ag
+    for i_k in range(tl.cdiv(K, BK)):  # iterate over K in blocks of BK
+        p_ag = tl.make_block_ptr(
+            ag + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_ag = tl.load(p_ag, boundary_check=(0, 1))
+        b_w = tl.dot(b_Aab_inv, b_ag)  # both bf16 or fp16
+        tl.store(
+            p_w,
+            b_w.to(p_w.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )
+    for i_v in range(tl.cdiv(V, BV)):  # iterate over V in blocks of BV
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_u = tl.dot(b_Aak, b_v)  # both bf16 or fp16
+        tl.store(
+            p_u,
+            b_u.to(p_u.dtype.element_ty, fp_downcast_rounding="rtne"),
+            boundary_check=(0, 1),
+        )