rwkv-ops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rwkv-ops might be problematic.

Files changed (43)
  1. rwkv_ops/__init__.py +26 -0
  2. rwkv_ops/rwkv7_kernel/__init__.py +153 -0
  3. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +221 -0
  4. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  5. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  6. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  7. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  8. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  9. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  10. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  11. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  12. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  13. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  14. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  15. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  16. rwkv_ops/rwkv7_kernel/native_keras_op.py +95 -0
  17. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  18. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  19. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  20. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  21. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  22. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  23. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  24. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  25. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  26. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  27. rwkv_ops/rwkv7_kernel/torch_op.py +523 -0
  28. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  29. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  30. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  31. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  32. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  33. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  34. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  35. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  36. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  37. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  38. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  39. rwkv_ops-0.1.0.dist-info/LICENSE.txt +201 -0
  40. rwkv_ops-0.1.0.dist-info/METADATA +118 -0
  41. rwkv_ops-0.1.0.dist-info/RECORD +43 -0
  42. rwkv_ops-0.1.0.dist-info/WHEEL +5 -0
  43. rwkv_ops-0.1.0.dist-info/top_level.txt +1 -0
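
The listing above implies a multi-backend layout: device probes (get_jax_devices_info.py, get_torch_devices_info.py), shared Triton kernels under triton_kernel/, framework-native fallbacks under jax_kernel/ and torch_kernel/, a Keras-level reference in native_keras_op.py, and jax_op.py / torch_op.py as the per-framework entry points. The sketch below only illustrates how such a layout is typically wired together; the helper name select_rwkv7_backend is hypothetical, and nothing beyond the module paths is taken from the wheel.

import importlib

def select_rwkv7_backend(framework: str, on_gpu: bool):
    # Hypothetical dispatch helper, not the package's actual API.
    if framework == "torch":
        # torch_op.py presumably drives the Triton kernels when a GPU is present;
        # otherwise the portable Keras implementation is the safe fallback.
        name = "torch_op" if on_gpu else "native_keras_op"
    elif framework == "jax":
        name = "jax_op"
    else:
        raise ValueError(f"unsupported framework: {framework!r}")
    return importlib.import_module(f"rwkv_ops.rwkv7_kernel.{name}")
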
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py
@@ -0,0 +1,328 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+import triton
+import triton.language as tl
+
+from ..triton_kernel.utils import exp, gather, use_cuda_graph
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16, 32]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BK", "BT", "K"],
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_dplr_bwd_kernel_intra(
+    q,
+    k,
+    a,
+    b,
+    gi,
+    ge,
+    dAqk,
+    dAqb,
+    dAak,
+    dAab,
+    dqg,
+    dkg,
+    dag,
+    dbg,
+    T,
+    dq,
+    dk,
+    da,
+    db,
+    dgk,
+    dgk_offset,
+    scale: tl.constexpr,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    GATHER_SUPPORTED: tl.constexpr,
+):
+    i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if False:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = (i_b * T).to(tl.int32), (i_b * T + T).to(tl.int32)
+
+    if i_t * BT >= T:
+        return
+
+    # offset calculation
+    ge += (bos * H + i_h) * K
+    gi += (bos * H + i_h) * K
+    q += (bos * H + i_h) * K
+    a += (bos * H + i_h) * K
+    b += (bos * H + i_h) * K
+    k += (bos * H + i_h) * K
+    dq += (bos * H + i_h) * K
+    dk += (bos * H + i_h) * K
+    da += (bos * H + i_h) * K
+    db += (bos * H + i_h) * K
+    dqg += (bos * H + i_h) * K
+    dag += (bos * H + i_h) * K
+    dkg += (bos * H + i_h) * K
+    dbg += (bos * H + i_h) * K
+    dgk += (bos * H + i_h) * K
+    dgk_offset += (bos * H + i_h) * K
+    dAqk += (bos * H + i_h) * BT
+    dAqb += (bos * H + i_h) * BT
+    dAak += (bos * H + i_h) * BT
+    dAab += (bos * H + i_h) * BT
+
+    stride_qk = H * K
+    stride_A = H * BT
+
+    p_ge = tl.make_block_ptr(
+        ge, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_gi = tl.make_block_ptr(
+        gi, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    # [BC, BK]
+    b_ge = tl.load(p_ge, boundary_check=(0, 1))
+    b_gi = tl.load(p_gi, boundary_check=(0, 1))
+    b_dq = tl.zeros([BC, BK], dtype=tl.float32)
+    b_da = tl.zeros([BC, BK], dtype=tl.float32)
+    b_dk = tl.zeros([BC, BK], dtype=tl.float32)
+    b_db = tl.zeros([BC, BK], dtype=tl.float32)
+    # intra chunk gradient calculation
+    p_dAqk = tl.make_block_ptr(
+        dAqk, (T, BT), (stride_A, 1), (i_t * BT, 0), (BC, BC), (1, 0)
+    )
+    p_dAab = tl.make_block_ptr(
+        dAab, (T, BT), (stride_A, 1), (i_t * BT, 0), (BC, BC), (1, 0)
+    )
+    p_dAqb = tl.make_block_ptr(
+        dAqb, (T, BT), (stride_A, 1), (i_t * BT, 0), (BC, BC), (1, 0)
+    )
+    p_dAak = tl.make_block_ptr(
+        dAak, (T, BT), (stride_A, 1), (i_t * BT, 0), (BC, BC), (1, 0)
+    )
+    o_i = tl.arange(0, BC)
+    p_k = tl.make_block_ptr(
+        k, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_b = tl.make_block_ptr(
+        b, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_a = tl.make_block_ptr(
+        a, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_q = tl.make_block_ptr(
+        q, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    b_k = tl.load(p_k, boundary_check=(0, 1))
+    b_b = tl.load(p_b, boundary_check=(0, 1))
+    b_q = tl.load(p_q, boundary_check=(0, 1))
+    b_a = tl.load(p_a, boundary_check=(0, 1))
+    b_dAqk = tl.load(p_dAqk, boundary_check=(0, 1))
+    b_dAab = tl.load(p_dAab, boundary_check=(0, 1))
+    b_dAqb = tl.load(p_dAqb, boundary_check=(0, 1))
+    b_dAak = tl.load(p_dAak, boundary_check=(0, 1))
+
+    # inter chunk gradient calculation
+    o_k = i_k * BK + tl.arange(0, BK)
+    m_k = o_k < K
+    # intra chunk gradient calculation
+    for j in range(0, min(BC, T - i_t * BT)):
+        # trick to index the block
+        if GATHER_SUPPORTED:
+            row_idx = tl.full([1, BK], j, dtype=tl.int16)
+            col_idx = tl.full([BC, 1], j, dtype=tl.int16)
+            row_idx_bc = tl.full([1, BC], j, dtype=tl.int16)
+            # [1, BK]
+            b_kj = gather(b_k, row_idx, axis=0)
+            b_bj = gather(b_b, row_idx, axis=0)
+            b_gij = gather(b_gi, row_idx, axis=0)
+            b_gej = gather(b_ge, row_idx, axis=0)
+            b_qj = gather(b_q, row_idx, axis=0)
+            b_aj = gather(b_a, row_idx, axis=0)
+            # [BC, 1]
+            b_dAqk_j = gather(b_dAqk, col_idx, axis=1)
+            b_dAab_j = gather(b_dAab, col_idx, axis=1)
+            b_dAqb_j = gather(b_dAqb, col_idx, axis=1)
+            b_dAak_j = gather(b_dAak, col_idx, axis=1)
+            # [1, BC] -> [BC, 1]
+            b_dA_qk_j = tl.sum(gather(b_dAqk, row_idx_bc, axis=0), 0)[:, None]
+            b_dA_qk_j = tl.sum(gather(b_dAqk, row_idx_bc, axis=0), 0)[:, None]
+            b_dA_ab_j = tl.sum(gather(b_dAab, row_idx_bc, axis=0), 0)[:, None]
+            b_dA_qb_j = tl.sum(gather(b_dAqb, row_idx_bc, axis=0), 0)[:, None]
+            b_dA_ak_j = tl.sum(gather(b_dAak, row_idx_bc, axis=0), 0)[:, None]
+        else:
+            mask_idx = tl.arange(0, BC) == j
+            b_kj = tl.sum(tl.where(mask_idx[:, None], b_k, 0), 0)[None, :]
+            b_bj = tl.sum(tl.where(mask_idx[:, None], b_b, 0), 0)[None, :]
+            b_gij = tl.sum(tl.where(mask_idx[:, None], b_gi, 0), 0)[None, :]
+            b_gej = tl.sum(tl.where(mask_idx[:, None], b_ge, 0), 0)[None, :]
+            b_dAqk_j = tl.sum(tl.where(mask_idx[None, :], b_dAqk, 0), 1)[:, None]
+            b_dAab_j = tl.sum(tl.where(mask_idx[None, :], b_dAab, 0), 1)[:, None]
+            b_dAqb_j = tl.sum(tl.where(mask_idx[None, :], b_dAqb, 0), 1)[:, None]
+            b_dAak_j = tl.sum(tl.where(mask_idx[None, :], b_dAak, 0), 1)[:, None]
+            b_dA_qk_j = tl.sum(tl.where(mask_idx[:, None], b_dAqk, 0), 0)[:, None]
+            b_dA_ab_j = tl.sum(tl.where(mask_idx[:, None], b_dAab, 0), 0)[:, None]
+            b_dA_qb_j = tl.sum(tl.where(mask_idx[:, None], b_dAqb, 0), 0)[:, None]
+            b_dA_ak_j = tl.sum(tl.where(mask_idx[:, None], b_dAak, 0), 0)[:, None]
+            # [1, BK] b_qj, b_aj
+            b_qj = tl.sum(tl.where(mask_idx[:, None], b_q, 0), 0)[None, :]
+            b_aj = tl.sum(tl.where(mask_idx[:, None], b_a, 0), 0)[None, :]
+
+        m_e = o_i[:, None] > j
+        m_i = o_i[:, None] >= j
+        tmp1 = exp(b_gi - b_gij)
+        tmp2 = exp(b_ge - b_gij)
+        b_dq += tl.where(m_i, b_dAqk_j * b_kj * tmp1, 0.0)
+        b_dq += tl.where(m_i, b_dAqb_j * b_bj * tmp1, 0.0)
+        b_da += tl.where(m_e, b_dAab_j * b_bj * tmp2, 0.0)
+        b_da += tl.where(m_e, b_dAak_j * b_kj * tmp2, 0.0)
+
+        m_i = o_i[:, None] <= j
+        m_e = o_i[:, None] < j
+        tmp1 = exp(b_gij - b_gi)
+        tmp2 = exp(b_gej - b_gi)
+        b_dk += tl.where(m_i, b_dA_qk_j * b_qj * tmp1, 0.0)
+        b_dk += tl.where(m_e, b_dA_ak_j * b_aj * tmp2, 0.0)
+        b_db += tl.where(m_i, b_dA_qb_j * b_qj * tmp1, 0.0)
+        b_db += tl.where(m_e, b_dA_ab_j * b_aj * tmp2, 0.0)
+
+    # post processing
+    p_dq = tl.make_block_ptr(
+        dq, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dk = tl.make_block_ptr(
+        dk, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_da = tl.make_block_ptr(
+        da, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_db = tl.make_block_ptr(
+        db, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dgk = tl.make_block_ptr(
+        dgk, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dgk_offset = tl.make_block_ptr(
+        dgk_offset, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dqg = tl.make_block_ptr(
+        dqg, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dkg = tl.make_block_ptr(
+        dkg, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dag = tl.make_block_ptr(
+        dag, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_dbg = tl.make_block_ptr(
+        dbg, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0)
+    )
+    p_gn = gi + (min(i_t * BT + BT, T) - 1) * stride_qk + o_k
+    p_gn = tl.max_contiguous(tl.multiple_of(p_gn, BK), BK)
+    b_gn = tl.load(p_gn, mask=m_k, other=0)
+    b_da += tl.load(p_dag, boundary_check=(0, 1)) * exp(b_ge)
+    b_dq += tl.load(p_dqg, boundary_check=(0, 1)) * exp(b_gi) * scale
+    tmp = exp(b_gn[None, :] - b_gi)
+    b_dk += tl.load(p_dkg, boundary_check=(0, 1)).to(tl.float32) * tmp
+    b_db += tl.load(p_dbg, boundary_check=(0, 1)).to(tl.float32) * tmp
+    tl.store(p_dq, (b_dq).to(p_dq.dtype.element_ty), boundary_check=(0, 1))
+    tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
+    tl.store(p_da, b_da.to(p_da.dtype.element_ty), boundary_check=(0, 1))
+    tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0, 1))
+    b_dgk = (b_dq * b_q + b_da * b_a - b_dk * b_k - b_db * b_b).to(tl.float32)
+    b_dgk_offset = b_da * b_a
+    tl.store(p_dgk, b_dgk.to(p_dgk.dtype.element_ty), boundary_check=(0, 1))
+    tl.store(
+        p_dgk_offset,
+        b_dgk_offset.to(p_dgk_offset.dtype.element_ty),
+        boundary_check=(0, 1),
+    )
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16, 32]
+        for num_stages in [2, 3, 4]
+        for BK in [32, 64]
+    ],
+    key=["BK", "BT", "K"],
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_dplr_bwd_dgk_kernel(
+    dgk,
+    dgk_offset,
+    dgk_last,
+    T,
+    dgk_output,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+):
+    i_t, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if False:
+        i_tg = i_t
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = (i_b * NT + i_t).to(tl.int32)
+        bos, eos = (i_b * T).to(tl.int32), (i_b * T + T).to(tl.int32)
+
+    stride_qk = H * K
+    dgk += (bos * H + i_h) * K
+    dgk_offset += (bos * H + i_h) * K
+    dgk_last += (i_tg * H + i_h) * K
+    dgk_output += (bos * H + i_h) * K
+    p_dgk_last = dgk_last + tl.arange(0, BK) + i_k * BK
+    m_k = tl.arange(0, BK) + i_k * BK < K
+    b_dgk_last = tl.load(p_dgk_last, mask=m_k, other=0)
+    p_dgk_offset = tl.make_block_ptr(
+        dgk_offset, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+    )
+    p_dgk = tl.make_block_ptr(
+        dgk, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+    )
+    b_dgk = tl.load(p_dgk, boundary_check=(0, 1))
+    b_dgk_offset = tl.load(p_dgk_offset, boundary_check=(0, 1))
+    # m_inv_cumsum = (tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :]).to(tl.float32)
+    # b_dgk_cumsum = tl.dot(m_inv_cumsum, b_dgk, allow_tf32=False)
+    b_dgk_cumsum = tl.cumsum(b_dgk, 0, reverse=True)
+    b_dgk_cumsum += b_dgk_last[None, :]
+    b_dgk_cumsum -= b_dgk_offset
+    p_dgk_output = tl.make_block_ptr(
+        dgk_output, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+    )
+    tl.store(
+        p_dgk_output,
+        b_dgk_cumsum.to(p_dgk_output.dtype.element_ty),
+        boundary_check=(0, 1),
+    )
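
In chunk_dplr_bwd_dgk_kernel above, the per-token decay gradient is assembled from three pieces: a reverse (suffix) cumulative sum of dgk within the chunk, the chunk-level term dgk_last, and a per-token correction dgk_offset. The PyTorch fragment below restates that post-processing step purely as a readability aid; the [B, T, H, K] layout, the [B, NT, H, K] shape of dgk_last, and T being divisible by BT are assumptions, not something this file specifies.

import torch

def dgk_postprocess_reference(dgk, dgk_offset, dgk_last, BT):
    # dgk, dgk_offset: [B, T, H, K]; dgk_last: [B, NT, H, K]; assumes T % BT == 0.
    out = torch.empty_like(dgk)
    T = dgk.shape[1]
    for start in range(0, T, BT):
        blk = dgk[:, start:start + BT]                    # one chunk, [B, BT, H, K]
        # inclusive suffix sum over the chunk's time axis, i.e. tl.cumsum(..., reverse=True)
        suffix = blk.flip(dims=[1]).cumsum(dim=1).flip(dims=[1])
        suffix = suffix + dgk_last[:, start // BT, None]  # add the chunk-level term
        out[:, start:start + BT] = suffix - dgk_offset[:, start:start + BT]
    return out
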
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+import triton
+import triton.language as tl
+
+from ..triton_kernel.utils import exp, gather, use_cuda_graph
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16, 32]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BK", "BT"],
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_dplr_fwd_A_kernel_intra_sub_intra(
+    q,
+    k,
+    a,
+    b,
+    gi,
+    ge,
+    T,
+    qg,
+    kg,
+    ag,
+    bg,
+    Aqk,
+    Aqb,
+    Aab,
+    Aak,
+    scale: tl.constexpr,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    GATHER_SUPPORTED: tl.constexpr,
+):
+    i_t, i_b, i_h = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+
+    if False:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if i_t * BT >= T:
+        return
+
+    o_i = tl.arange(0, BC)
+    o_k = tl.arange(0, BK)
+    m_k = o_k < K
+    m_A = (i_t * BT + tl.arange(0, BC)) < T
+    last_idx = min((i_t + 1) * BT, T) - 1
+    o_A = (bos + i_t * BT + tl.arange(0, BC)) * H * BT + i_h * BT
+    p_q = tl.make_block_ptr(
+        q + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_k = tl.make_block_ptr(
+        k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_a = tl.make_block_ptr(
+        a + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_b = tl.make_block_ptr(
+        b + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_gi = tl.make_block_ptr(
+        gi + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_ge = tl.make_block_ptr(
+        ge + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_g_last = gi + (bos * H + i_h) * K + last_idx * H * K + tl.arange(0, BK)
+    b_g_last = tl.load(p_g_last, mask=m_k, other=0)
+    p_qg = tl.make_block_ptr(
+        qg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_kg = tl.make_block_ptr(
+        kg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_ag = tl.make_block_ptr(
+        ag + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+    p_bg = tl.make_block_ptr(
+        bg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, 0), (BC, BK), (1, 0)
+    )
+
+    b_q = tl.load(p_q, boundary_check=(0, 1))
+    b_q = b_q * scale
+    b_k = tl.load(p_k, boundary_check=(0, 1))
+    b_a = tl.load(p_a, boundary_check=(0, 1))
+    b_b = tl.load(p_b, boundary_check=(0, 1))
+    b_gi = tl.load(p_gi, boundary_check=(0, 1)).to(tl.float32)
+    b_ge = tl.load(p_ge, boundary_check=(0, 1)).to(tl.float32)
+
+    # deal with decay term.
+    g_exp = exp(b_gi)
+    g_exp_inv = exp(-b_gi + b_g_last[None, :])
+    b_qg = b_q * g_exp
+    b_kg = b_k * g_exp_inv
+    b_bg = b_b * g_exp_inv
+    b_ag = b_a * exp(b_ge)
+    tl.store(
+        p_qg,
+        b_qg.to(p_qg.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_bg,
+        b_bg.to(p_bg.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_ag,
+        b_ag.to(p_ag.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_kg,
+        b_kg.to(p_kg.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    # tl.debug_barrier()
+
+    b_q = b_q.to(b_k.dtype)
+    # inner attn
+    for j in range(0, min(BC, T - i_t * BT)):
+        # a trick to index the j-th row of b_k, b_g, b_b
+        if GATHER_SUPPORTED:
+            row_idx = tl.full([1, BK], j, dtype=tl.int16)
+            # [1, BK]
+            b_k_j = gather(b_k, row_idx, axis=0)
+            b_gk_j = gather(b_gi, row_idx, axis=0)
+            b_b_j = gather(b_b, row_idx, axis=0)
+        else:
+            mask = tl.arange(0, BC) == j
+            b_k_j = tl.sum(tl.where(mask[:, None], b_k, 0), 0)[None, :]
+            b_gk_j = tl.sum(tl.where(mask[:, None], b_gi, 0), 0)[None, :]
+            b_b_j = tl.sum(tl.where(mask[:, None], b_b, 0), 0)[None, :]
+        tmp = exp(b_gi - b_gk_j)
+        b_A_qk = tl.sum(b_q * b_k_j * tmp, 1)
+        m_i = (o_i >= j).to(tl.float32)
+        b_A_qk = b_A_qk * m_i
+        b_A_qb = tl.sum(b_q * b_b_j * tmp, 1)
+        b_A_qb = b_A_qb * m_i
+        tmp2 = exp(b_ge - b_gk_j)
+        b_A_ak = tl.sum(b_a * b_k_j * tmp2, 1)
+        m_i2 = (o_i > j).to(tl.float32)
+        b_A_ak = b_A_ak * m_i2
+        b_A_ab = tl.sum(b_a * b_b_j * tmp2, 1)
+        b_A_ab = b_A_ab * m_i2
+
+        tl.store(
+            Aqk + o_A + j,
+            b_A_qk.to(dtype=Aqk.dtype.element_ty, fp_downcast_rounding="rtne"),
+            mask=m_A,
+        )
+        tl.store(
+            Aqb + o_A + j,
+            b_A_qb.to(dtype=Aqb.dtype.element_ty, fp_downcast_rounding="rtne"),
+            mask=m_A,
+        )
+        tl.store(
+            Aab + o_A + j,
+            b_A_ab.to(dtype=Aqb.dtype.element_ty, fp_downcast_rounding="rtne"),
+            mask=m_A,
+        )
+        tl.store(
+            Aak + o_A + j,
+            b_A_ak.to(dtype=Aqk.dtype.element_ty, fp_downcast_rounding="rtne"),
+            mask=m_A,
+        )
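
Within one chunk, chunk_dplr_fwd_A_kernel_intra_sub_intra above fills four [BT, BT] score matrices; for example Aqk[i, j] = sum_k q[i, k] * scale * k[j, k] * exp(gi[i, k] - gi[j, k]), kept only where i >= j, while the a-based scores use ge and a strict i > j mask (Aqb and Aab follow the same pattern with b in place of k). The fragment below recomputes Aqk and Aak for a single (batch, head, chunk) slice as a readability aid; the contiguous [BT, K] slice is an assumption and the code is not part of the package.

import torch

def intra_chunk_scores_reference(q, k, a, gi, ge, scale):
    # One (batch, head, chunk) slice: q, k, a, gi, ge are [BT, K]; outputs are [BT, BT].
    decay_qk = torch.exp(gi[:, None, :] - gi[None, :, :])   # exp(gi[i] - gi[j]) per feature
    Aqk = ((q * scale)[:, None, :] * k[None, :, :] * decay_qk).sum(-1)
    Aqk = torch.tril(Aqk)                                   # kernel masks with o_i >= j
    decay_ak = torch.exp(ge[:, None, :] - gi[None, :, :])
    Aak = (a[:, None, :] * k[None, :, :] * decay_ak).sum(-1)
    Aak = torch.tril(Aak, diagonal=-1)                      # kernel masks with o_i > j
    return Aqk, Aak
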
rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+import triton
+import triton.language as tl
+
+from ..triton_kernel.utils import exp, use_cuda_graph
+
+
+@triton.heuristics(
+    {
+        "USE_FINAL_STATE_GRADIENT": lambda args: args["dht"] is not None,
+        "USE_INITIAL_STATE": lambda args: args["dh0"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8, 16, 32]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BT", "BK", "BV", "V"],
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_dplr_bwd_kernel_dhu(
+    qg,
+    bg,
+    w,
+    gk,
+    dht,
+    dv,
+    do,
+    T,
+    dh,
+    dh0,
+    dv2,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_FINAL_STATE_GRADIENT: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_h = i_nh // H, i_nh % H
+    if False:
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        NT = tl.cdiv(T, BT)
+        boh = i_n * NT
+
+    # [BK, BV]
+    b_dh = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_FINAL_STATE_GRADIENT:
+        p_dht = tl.make_block_ptr(
+            dht + i_nh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
+        b_dh += tl.load(p_dht, boundary_check=(0, 1))
+
+    mask_k = tl.arange(0, BK) < K
+    for i_t in range(NT - 1, -1, -1):
+        p_dh = tl.make_block_ptr(
+            dh + ((boh + i_t) * H + i_h) * K * V,
+            (K, V),
+            (V, 1),
+            (i_k * BK, i_v * BV),
+            (BK, BV),
+            (1, 0),
+        )
+        tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
+        b_dh_tmp = tl.zeros([BK, BV], dtype=tl.float32)
+        for i_c in range(tl.cdiv(BT, BC) - 1, -1, -1):
+            p_qg = tl.make_block_ptr(
+                qg + (bos * H + i_h) * K,
+                (K, T),
+                (1, H * K),
+                (i_k * BK, i_t * BT + i_c * BC),
+                (BK, BC),
+                (0, 1),
+            )
+            p_bg = tl.make_block_ptr(
+                bg + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT + i_c * BC, i_k * BK),
+                (BC, BK),
+                (1, 0),
+            )
+            p_w = tl.make_block_ptr(
+                w + (bos * H + i_h) * K,
+                (K, T),
+                (1, H * K),
+                (i_k * BK, i_t * BT + i_c * BC),
+                (BK, BC),
+                (0, 1),
+            )
+            p_dv = tl.make_block_ptr(
+                dv + (bos * H + i_h) * V,
+                (T, V),
+                (H * V, 1),
+                (i_t * BT + i_c * BC, i_v * BV),
+                (BC, BV),
+                (1, 0),
+            )
+            p_do = tl.make_block_ptr(
+                do + (bos * H + i_h) * V,
+                (T, V),
+                (H * V, 1),
+                (i_t * BT + i_c * BC, i_v * BV),
+                (BC, BV),
+                (1, 0),
+            )
+            p_dv2 = tl.make_block_ptr(
+                dv2 + (bos * H + i_h) * V,
+                (T, V),
+                (H * V, 1),
+                (i_t * BT + i_c * BC, i_v * BV),
+                (BC, BV),
+                (1, 0),
+            )
+            # [BK, BT]
+            b_qg = tl.load(p_qg, boundary_check=(0, 1))
+            # [BT, BK]
+            b_bg = tl.load(p_bg, boundary_check=(0, 1))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            # [BT, V]
+            b_do = tl.load(p_do, boundary_check=(0, 1))
+            b_dv = tl.load(p_dv, boundary_check=(0, 1))
+            b_dv2 = b_dv + tl.dot(b_bg, b_dh.to(b_bg.dtype))
+            tl.store(p_dv2, b_dv2.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
+            # [BK, BV]
+            b_dh_tmp += tl.dot(b_qg, b_do.to(b_qg.dtype))
+            b_dh_tmp += tl.dot(b_w, b_dv2.to(b_qg.dtype))
+        last_idx = min((i_t + 1) * BT, T) - 1
+        bg_last = tl.load(
+            gk + ((bos + last_idx) * H + i_h) * K + tl.arange(0, BK), mask=mask_k
+        )
+        b_dh *= exp(bg_last)[:, None]
+        b_dh += b_dh_tmp
+
+    if USE_INITIAL_STATE:
+        p_dh0 = tl.make_block_ptr(
+            dh0 + i_nh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
+        tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
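
chunk_dplr_bwd_kernel_dhu above sweeps the chunks in reverse: it stores the incoming state gradient dh for chunk i_t, forms dv2 = dv + bg @ dh, accumulates qg^T @ do + w^T @ dv2, and then carries the state backwards as dh = exp(gk_last) * dh + accumulator. The PyTorch restatement below covers one (batch, head) pair and folds the sub-chunk loop into a single block (it assumes BC == BT); the shapes are readability assumptions, not the package's launcher contract.

import torch

def bwd_dhu_reference(qg, bg, w, gk, do, dv, dht=None):
    # qg, bg, w, gk: [NT, BT, K]; do, dv: [NT, BT, V]; dht (grad of final state): [K, V] or None.
    NT, BT, K = qg.shape
    V = do.shape[-1]
    dh = torch.zeros(NT, K, V)
    dv2 = torch.zeros_like(dv)
    b_dh = dht.clone() if dht is not None else torch.zeros(K, V)
    for i_t in range(NT - 1, -1, -1):
        dh[i_t] = b_dh                                   # gradient w.r.t. the state entering chunk i_t
        dv2[i_t] = dv[i_t] + bg[i_t] @ b_dh              # matches b_dv2 = b_dv + dot(b_bg, b_dh)
        acc = qg[i_t].T @ do[i_t] + w[i_t].T @ dv2[i_t]  # matches the two dot accumulations
        gk_last = gk[i_t, -1]                            # decay at the chunk's last position, [K]
        b_dh = torch.exp(gk_last)[:, None] * b_dh + acc
    return dh, dv2, b_dh                                 # b_dh plays the role of dh0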