rwkv_ops-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rwkv-ops might be problematic.

Files changed (43)
  1. rwkv_ops/__init__.py +26 -0
  2. rwkv_ops/rwkv7_kernel/__init__.py +153 -0
  3. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +221 -0
  4. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  5. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  6. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  7. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  8. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  9. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  10. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  11. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  12. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  13. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  14. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  15. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  16. rwkv_ops/rwkv7_kernel/native_keras_op.py +95 -0
  17. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  18. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  19. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  20. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  21. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  22. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  23. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  24. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  25. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  26. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  27. rwkv_ops/rwkv7_kernel/torch_op.py +523 -0
  28. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  29. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  30. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  31. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  32. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  33. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  34. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  35. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  36. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  37. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  38. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  39. rwkv_ops-0.1.0.dist-info/LICENSE.txt +201 -0
  40. rwkv_ops-0.1.0.dist-info/METADATA +118 -0
  41. rwkv_ops-0.1.0.dist-info/RECORD +43 -0
  42. rwkv_ops-0.1.0.dist-info/WHEEL +5 -0
  43. rwkv_ops-0.1.0.dist-info/top_level.txt +1 -0
rwkv_ops/rwkv7_kernel/jax_op.py
@@ -0,0 +1,382 @@
+ import jax
+ import jax.numpy as jnp
+ import triton
+ from .jax_kernel.chunk_A_bwd import chunk_dplr_bwd_dqk_intra
+ from .jax_kernel.chunk_A_fwd import chunk_dplr_fwd_intra
+ from .jax_kernel.chunk_h_bwd import chunk_dplr_bwd_dhu
+ from .jax_kernel.chunk_h_fwd import chunk_dplr_fwd_h
+ from .jax_kernel.chunk_o_bwd import (
+     chunk_dplr_bwd_dAu,
+     chunk_dplr_bwd_dv,
+     chunk_dplr_bwd_o,
+ )
+ from .jax_kernel.chunk_o_fwd import chunk_dplr_fwd_o
+ from .jax_kernel.wy_fast_bwd import chunk_dplr_bwd_wy
+ from .jax_kernel.wy_fast_fwd import prepare_wy_repr_fwd
+ from .jax_kernel.cumsum import chunk_rwkv6_fwd_cumsum
+
+ CHUNKSIZE = 16
+
+
+ def chunk_dplr_fwd(
+     q: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     gk: jax.Array,
+     scale: float,
+     initial_state: jax.Array,
+     output_final_state: bool,
+     chunk_size: int = 16,
+ ):
+     T = q.shape[1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+
+     gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT)
+
+     A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_dplr_fwd_intra(
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         scale=scale,
+         chunk_size=BT,
+     )
+
+     del ge
+
+     # A_ab, A_ak, gi, ge torch.float32
+     # A_qk, A_qb, qg, kg, ag, bg, dtype=q.dtype, eg: bf16
+     w, u, _ = prepare_wy_repr_fwd(ag=ag, A_ab=A_ab, A_ak=A_ak, v=v, chunk_size=BT)
+
+     del A_ab, A_ak
+     h, v_new, final_state = chunk_dplr_fwd_h(
+         kg=kg,
+         bg=bg,
+         v=v,
+         w=w,
+         u=u,
+         gk=gi,
+         initial_state=initial_state,
+         output_final_state=output_final_state,
+         chunk_size=BT,
+     )
+
+     del u, kg, bg, gi
+
+     o = chunk_dplr_fwd_o(
+         qg=qg, v=v, v_new=v_new, A_qk=A_qk, A_qb=A_qb, h=h, chunk_size=BT
+     )
+     del v_new, h, A_qk, A_qb
+
+     return o, final_state
+
+
+ def chunk_dplr_delta_rule_fwd(
+     q: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     gk: jax.Array,
+     scale=None,
+     initial_state=None,
+     output_final_state: bool = True,
+ ):
+     assert q.dtype == k.dtype == v.dtype
+     # assert q.dtype != torch.float32, "ChunkDeltaRuleFunction does not support float32. Please use bfloat16."
+     # gk = gk.float()
+
+     scale = k.shape[-1] ** -0.5 if scale is None else scale
+     chunk_size = CHUNKSIZE
+
+     o, final_state = chunk_dplr_fwd(
+         q=q,
+         k=k,
+         v=v,
+         a=a,
+         b=b,
+         gk=gk,
+         scale=scale,
+         initial_state=initial_state,
+         output_final_state=output_final_state,
+         chunk_size=chunk_size,
+     )
+     return o, final_state
+
+
+ def cal_log_w(w: jax.Array) -> jax.Array:
+     return -jnp.exp(w)
+
+
+ @jax.custom_vjp
+ def chunk_dplr(
+     r: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     gk: jax.Array,
+     initial_state: jax.Array = None,
+ ):
+     return chunk_dplr_delta_rule_fwd(
+         q=r,
+         k=k,
+         v=v,
+         a=a,
+         b=b,
+         gk=gk,
+         scale=1,
+         initial_state=initial_state,
+         output_final_state=True,
+     )
+
+
+ def chunk_dplr_fwd_jax(
+     r: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     gk: jax.Array,
+     initial_state: jax.Array = None,
+ ):
+     o, state = chunk_dplr_delta_rule_fwd(
+         q=r,
+         k=k,
+         v=v,
+         a=a,
+         b=b,
+         gk=gk,
+         scale=1,
+         initial_state=initial_state,
+         output_final_state=True,
+     )
+     cache = (r, k, v, a, b, gk, initial_state)
+     return [o, state], cache
+
+
+ def chunk_dplr_bwd(
+     q: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     gk: jax.Array,
+     initial_state,
+     scale,
+     do: jax.Array,
+     dht: jax.Array,
+     chunk_size: int = CHUNKSIZE,
+ ):
+     # DTYPE = do.dtype
+     BT = chunk_size
+     scale = scale
+     # if do != None:
+     #     do = do, q.dtype)
+     # if dht != None:
+     #     dht = dht, q.dtype)
+
+     # ******* start recomputing everything, otherwise i believe the gpu memory will be exhausted *******
+     gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT)
+
+     A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_dplr_fwd_intra(
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         scale=scale,
+         chunk_size=BT,
+     )
+     w, u, A_ab_inv = prepare_wy_repr_fwd(
+         ag=ag, A_ab=A_ab, A_ak=A_ak, v=v, chunk_size=BT
+     )
+     del A_ab
+     h, v_new, _ = chunk_dplr_fwd_h(
+         kg=kg, bg=bg, v=v, w=w, u=u, gk=gi, initial_state=initial_state, chunk_size=BT
+     )
+     del u
+     # ******* end of recomputation *******
+     # A_ak, A_ab_inv, gi, ge torch.float32
+     # A_qk, A_qb, qg, kg, ag, bg, v_new dtype=q.dtype, eg: bf16
+
+     dv_new_intra, dA_qk, dA_qb = chunk_dplr_bwd_dAu(
+         v=v, v_new=v_new, do=do, A_qb=A_qb, scale=scale, chunk_size=BT
+     )
+
+     dh, dh0, dv_new = chunk_dplr_bwd_dhu(
+         qg=qg,
+         bg=bg,
+         w=w,
+         gk=gi,
+         h0=initial_state,
+         dht=dht,
+         do=do,
+         dv=dv_new_intra,
+         chunk_size=BT,
+     )
+
+     dv = chunk_dplr_bwd_dv(A_qk=A_qk, kg=kg, do=do, dh=dh, chunk_size=BT)
+     del A_qk
+
+     dqg, dkg, dw, dbg, dgk_last = chunk_dplr_bwd_o(
+         k=kg,
+         b=bg,
+         v=v,
+         v_new=v_new,
+         do=do,
+         h=h,
+         dh=dh,
+         dv=dv_new,
+         w=w,
+         gk=gi,
+         chunk_size=BT,
+         scale=scale,
+     )
+     del v_new
+
+     dA_ab, dA_ak, dv, dag = chunk_dplr_bwd_wy(
+         A_ab_inv=A_ab_inv,
+         A_ak=A_ak,
+         v=v,
+         ag=ag,
+         dw=dw,
+         du=dv_new,
+         dv0=dv,
+         chunk_size=BT,
+     )
+     del A_ak
+
+     dq, dk, da, db, dgk = chunk_dplr_bwd_dqk_intra(
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         dAqk=dA_qk,
+         dAqb=dA_qb,
+         dAak=dA_ak,
+         dAab=dA_ab,
+         dgk_last=dgk_last,
+         dqg=dqg,
+         dkg=dkg,
+         dag=dag,
+         dbg=dbg,
+         chunk_size=BT,
+         scale=scale,
+     )
+     return (
+         jnp.asarray(dq, q.dtype),
+         jnp.asarray(dk, k.dtype),
+         jnp.asarray(dv, v.dtype),
+         jnp.asarray(da, a.dtype),
+         jnp.asarray(db, b.dtype),
+         jnp.asarray(dgk, gk.dtype),
+         None if initial_state is None else jnp.asarray(dh0, initial_state.dtype),
+     )
+
+
+ def chunk_dplr_bwd_jax(res, g):
+     q, k, v, a, b, gk, initial_state = res
+     do, dht = g
+     return chunk_dplr_bwd(
+         q,
+         k,
+         v,
+         a,
+         b,
+         gk,
+         initial_state,
+         scale=1,
+         do=do,
+         dht=dht,
+     )
+
+
+ chunk_dplr.defvjp(chunk_dplr_fwd_jax, chunk_dplr_bwd_jax)
+
+
+ def transpose_head(x, head_first):
+     # x = jnp.asarray(x,"bfloat16")
+     if head_first:
+         return jnp.transpose(x, (0, 2, 1, 3))
+     else:
+         return x
+
+
+ # @partial(jax.jit, static_argnames=['initial_state',"output_final_state","head_first","use_chunk"])
+ def generalized_delta_rule(
+     r: jax.Array,
+     w: jax.Array,
+     k: jax.Array,
+     v: jax.Array,
+     a: jax.Array,
+     b: jax.Array,
+     initial_state: jax.Array = None,
+     output_final_state: bool = True,
+     head_first: bool = False,
+ ):
+     r"""
+     Main interface function for chunked delta rule attention.
+
+     Args:
+         q (jax.Array):
+             queries of shape `[B, T, H, K]`
+         k (jax.Array):
+             keys of shape `[B, T, H, K]`
+         v (jax.Array):
+             values of shape `[B, T, H, V]`
+         a (jax.Array):
+             activations of shape `[B, T, H, K]`
+         b (jax.Array):
+             betas of shape `[B, T, H, K]`
+         gk (jax.Array):
+             gk of shape `[B, T, H, K]`, decay term in log space!
+         initial_state (Optional[jax.Array]):
+             Initial state of shape `[N, H, K, V]` for `N` input sequences.
+             For equal-length input sequences, `N` equals the batch size `B`.
+             Default: `None`.
+         output_final_state (Optional[bool]):
+             Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+         head_first (Optional[bool]):
+             Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+             Default: `False`.
+
+     Returns:
+         o (jax.Array):
+             Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+         final_state (jax.Array):
+             Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+     """
+     DTYPE = r.dtype
+     r = transpose_head(r, head_first)
+     k = transpose_head(k, head_first)
+     v = transpose_head(v, head_first)
+     a = transpose_head(a, head_first)
+     b = transpose_head(b, head_first)
+
+     if w is not None:
+         log_w = cal_log_w(w)
+     else:
+         assert log_w is not None, "Either w or log_w must be provided!"
+     log_w = transpose_head(log_w, head_first)
+     o, final_state = chunk_dplr(
+         r=r,
+         k=k,
+         v=v,
+         a=a,
+         b=b,
+         gk=log_w,
+         initial_state=initial_state,
+     )
+     if output_final_state:
+         return jnp.asarray(o, DTYPE), final_state
+     return jnp.asarray(o, DTYPE)
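The file above is the JAX entry point: `generalized_delta_rule` maps the decay input through `log_w = -exp(w)`, dispatches into the chunked Triton-backed kernels via `chunk_dplr`, and registers a custom VJP for the backward pass. A minimal usage sketch follows; it assumes a GPU with Triton available, that the module is importable under the path shown in the file list, and uses arbitrary toy shapes and random inputs:

import jax
import jax.numpy as jnp
# Module path taken from the file list above; direct importability is an assumption.
from rwkv_ops.rwkv7_kernel.jax_op import generalized_delta_rule

B, T, H, K, V = 1, 32, 2, 64, 64                                   # toy sizes
keys = jax.random.split(jax.random.PRNGKey(0), 6)
r = jax.random.normal(keys[0], (B, T, H, K), dtype=jnp.bfloat16)
w = jax.random.normal(keys[1], (B, T, H, K), dtype=jnp.bfloat16)   # decay input; mapped to log_w = -exp(w) internally
k = jax.random.normal(keys[2], (B, T, H, K), dtype=jnp.bfloat16)
v = jax.random.normal(keys[3], (B, T, H, V), dtype=jnp.bfloat16)
a = jax.random.normal(keys[4], (B, T, H, K), dtype=jnp.bfloat16)
b = jax.random.normal(keys[5], (B, T, H, K), dtype=jnp.bfloat16)

o, final_state = generalized_delta_rule(
    r, w, k, v, a, b,
    initial_state=None,
    output_final_state=True,
    head_first=False,          # inputs are laid out as [B, T, H, ...]
)
print(o.shape)                 # (1, 32, 2, 64), i.e. [B, T, H, V]
print(final_state.shape)       # (1, 2, 64, 64), i.e. [B, H, K, V] per the docstring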
rwkv_ops/rwkv7_kernel/native_keras_op.py
@@ -0,0 +1,95 @@
+ import keras
+ from keras import ops
+
+
+ def transpose_head(x, head_first):
+     """
+     Transpose the input tensor.
+
+     Args:
+         x: input tensor.
+         head_first: boolean that decides whether to transpose.
+
+     Returns:
+         The transposed tensor if head_first is True, otherwise the original tensor.
+     """
+     x = ops.cast(x, "float32")
+     if head_first:
+         return ops.transpose(x, (0, 2, 1, 3))
+     else:
+         return x
+
+
+ def generalized_delta_rule(
+     r,
+     w,
+     k,
+     v,
+     a,
+     b,
+     initial_state=None,
+     output_final_state: bool = True,
+     head_first: bool = False,
+ ):
+     """
+     Implements the generalized delta rule.
+
+     Args:
+         r: input tensor.
+         w: weight tensor.
+         k, v, a, b: remaining input tensors.
+         initial_state: initial state tensor.
+         output_final_state: whether to return the final state.
+         head_first: whether the head dimension comes first in the computation.
+
+     Returns:
+         The output, plus the final state depending on output_final_state.
+     """
+     DTYPE = r.dtype
+     B, T, H, N = ops.shape(r)
+     r = transpose_head(r, head_first)
+
+     k = transpose_head(k, head_first)
+
+     v = transpose_head(v, head_first)
+     a = transpose_head(a, head_first)
+     b = transpose_head(b, head_first)
+     w = transpose_head(w, head_first)
+     w = ops.exp(-ops.exp(w))
+
+     if initial_state is not None:
+         state = initial_state
+         if ops.shape(state)[0] == 1:
+             state = ops.broadcast_to(state, (B, H, N, N))
+     else:
+         state = ops.zeros((B, H, N, N), dtype="float32")
+     out = ops.zeros((B, T, H, N), dtype=r.dtype)
+
+     def step(t, inputs):
+         """
+         Run the computation for a single time step.
+
+         Args:
+             t: current time step.
+             inputs: list holding the current state and output.
+
+         Returns:
+             The updated state and output.
+         """
+         state, out = inputs
+         kk = ops.reshape(k[:, t, :], (B, H, 1, N))
+         rr = ops.reshape(r[:, t, :], (B, H, N, 1))
+         vv = ops.reshape(v[:, t, :], (B, H, N, 1))
+         aa = ops.reshape(a[:, t, :], (B, H, N, 1))
+         bb = ops.reshape(b[:, t, :], (B, H, 1, N))
+         state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
+         out = ops.slice_update(
+             out, [0, t, 0, 0], ops.reshape((state @ rr), (B, 1, H, N))
+         )
+         return [state, out]
+
+     state, out = ops.fori_loop(0, T, step, [state, out])
+
+     if output_final_state:
+         return ops.cast(out, DTYPE), state
+     return ops.cast(out, DTYPE)
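This Keras file is the pure-`keras.ops` fallback: inside `ops.fori_loop` it applies the per-timestep recurrence state = state * w_t + state @ a_t b_t^T + v_t k_t^T followed by o_t = state @ r_t, so it runs on any backend without Triton. A small sanity-check sketch, again assuming the module path from the file list is directly importable and using arbitrary random inputs:

import keras
from keras import ops
# Module path taken from the file list above; direct importability is an assumption.
from rwkv_ops.rwkv7_kernel.native_keras_op import generalized_delta_rule

B, T, H, N = 1, 8, 2, 4        # tiny sizes; note this fallback assumes K == V == N
r, w, k, v, a, b = [keras.random.normal((B, T, H, N), seed=i) for i in range(6)]

out, state = generalized_delta_rule(
    r, w, k, v, a, b,
    initial_state=None,        # defaults to zeros of shape (B, H, N, N)
    output_final_state=True,
    head_first=False,
)
print(ops.shape(out))          # (1, 8, 2, 4)
print(ops.shape(state))        # (1, 2, 4, 4)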
rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py
@@ -0,0 +1,13 @@
+ from ..torch_kernel.chunk_A_fwd import *
+ from ..torch_kernel.chunk_A_bwd import *
+
+ # ---------- chunk_h ----------
+ from ..torch_kernel.chunk_h_fwd import *
+ from ..torch_kernel.chunk_h_bwd import *
+
+ # ---------- chunk_o ----------
+ from ..torch_kernel.chunk_o_fwd import *
+ from ..torch_kernel.chunk_o_bwd import *
+ from ..torch_kernel.cumsum import *
+ from ..torch_kernel.wy_fast_fwd import *
+ from ..torch_kernel.wy_fast_bwd import *
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py
@@ -0,0 +1,96 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+ import torch
+ import triton
+ from ..triton_kernel.chunk_A_bwd import *
+ from ..triton_kernel.utils import is_gather_supported
+ from ..get_torch_devices_info import check_shared_mem
+
+
+ def chunk_dplr_bwd_dqk_intra(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     gi: torch.Tensor,
+     ge: torch.Tensor,
+     dAqk: torch.Tensor,
+     dAqb: torch.Tensor,
+     dAak: torch.Tensor,
+     dAab: torch.Tensor,
+     dqg: torch.Tensor,
+     dkg: torch.Tensor,
+     dag: torch.Tensor,
+     dbg: torch.Tensor,
+     dgk_last: torch.Tensor,
+     scale: float = 1.0,
+     chunk_size: int = 16,
+ ):
+     B, T, H, K = q.shape
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+     BK = (
+         min(64, triton.next_power_of_2(K))
+         if check_shared_mem()
+         else min(32, triton.next_power_of_2(K))
+     )
+
+     NT = triton.cdiv(T, BT)
+     NK = triton.cdiv(K, BK)
+     grid = (NK, NT, B * H)
+
+     dq = torch.empty_like(q)
+     dk = torch.empty_like(k)
+     da = torch.empty_like(a)
+     db = torch.empty_like(b)
+     dgk = torch.empty_like(gi, dtype=torch.float)
+     dgk_offset = torch.empty_like(gi, dtype=torch.float)
+
+     chunk_dplr_bwd_kernel_intra[grid](
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         dAqk=dAqk,
+         dAqb=dAqb,
+         dAak=dAak,
+         dAab=dAab,
+         dq=dq,
+         dk=dk,
+         dgk=dgk,
+         dgk_offset=dgk_offset,
+         dqg=dqg,
+         dkg=dkg,
+         dag=dag,
+         dbg=dbg,
+         da=da,
+         db=db,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+         BC=BT,
+         BK=BK,
+         GATHER_SUPPORTED=is_gather_supported,
+     )
+
+     dgk_output = torch.empty_like(dgk)
+
+     def grid(meta):
+         return (NT, triton.cdiv(K, meta["BK"]), B * H)
+
+     chunk_dplr_bwd_dgk_kernel[grid](
+         dgk=dgk,
+         dgk_offset=dgk_offset,
+         dgk_last=dgk_last,
+         dgk_output=dgk_output,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+     )
+     return dq, dk, da, db, dgk_output
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py
@@ -0,0 +1,64 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+ import torch
+ import triton
+
+ from ..triton_kernel.utils import is_gather_supported
+
+ from ..triton_kernel.chunk_A_fwd import *
+
+
+ def chunk_dplr_fwd_intra(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     gi: torch.Tensor,
+     ge: torch.Tensor,
+     scale: float,
+     chunk_size: int,
+ ):
+     B, T, H, K = k.shape
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+
+     NT = triton.cdiv(T, BT)
+
+     Aqk = q.new_empty(B, T, H, BT, dtype=q.dtype)
+     Aqb = q.new_empty(B, T, H, BT, dtype=q.dtype)
+     # involving matrix inverse and it'd be better to use float here.
+     Aab = q.new_empty(B, T, H, BT, dtype=torch.float)
+     Aak = q.new_empty(B, T, H, BT, dtype=torch.float)
+
+     grid = (NT, B, H)
+     BK = triton.next_power_of_2(K)
+     qg = torch.empty_like(q)
+     kg = torch.empty_like(k, dtype=q.dtype)
+     ag = torch.empty_like(a, dtype=q.dtype)
+     bg = torch.empty_like(b, dtype=q.dtype)
+     chunk_dplr_fwd_A_kernel_intra_sub_intra[grid](
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         Aqk=Aqk,
+         Aqb=Aqb,
+         Aab=Aab,
+         Aak=Aak,
+         qg=qg,
+         kg=kg,
+         ag=ag,
+         bg=bg,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+         BC=BT,
+         BK=BK,
+         GATHER_SUPPORTED=is_gather_supported,
+     )
+     return Aab, Aqk, Aak, Aqb, qg, kg, ag, bg
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py
@@ -0,0 +1,74 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from typing import Optional, Tuple
+
+ import torch
+ import triton
+
+ from ..get_torch_devices_info import check_shared_mem
+ from ..triton_kernel.chunk_h_bwd import *
+
+
+ def chunk_dplr_bwd_dhu(
+     qg: torch.Tensor,
+     bg: torch.Tensor,
+     w: torch.Tensor,
+     gk: torch.Tensor,
+     h0: torch.Tensor,
+     dht: Optional[torch.Tensor],
+     do: torch.Tensor,
+     dv: torch.Tensor,
+     chunk_size: int = 64,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     B, T, H, K, V = *qg.shape, do.shape[-1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     BK = triton.next_power_of_2(K)
+     assert BK <= 256, (
+         "current kernel does not support head dimension being larger than 256."
+     )
+     # H100
+     if check_shared_mem("hopper", qg.device.index):
+         BV = 64
+         BC = 64 if K <= 128 else 32
+     elif check_shared_mem("ampere", qg.device.index):  # A100
+         BV = 32
+         BC = 32
+     else:  # Etc: 4090
+         BV = 16
+         BC = 16
+
+     N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+
+     BC = min(BT, BC)
+     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+     assert NK == 1, (
+         "NK > 1 is not supported because it involves time-consuming synchronization"
+     )
+
+     dh = qg.new_empty(B, NT, H, K, V)
+     dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None
+     dv2 = torch.zeros_like(dv)
+
+     grid = (NK, NV, N * H)
+     chunk_dplr_bwd_kernel_dhu[grid](
+         qg=qg,
+         bg=bg,
+         w=w,
+         gk=gk,
+         dht=dht,
+         dh0=dh0,
+         do=do,
+         dh=dh,
+         dv=dv,
+         dv2=dv2,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         BC=BC,
+         BK=BK,
+         BV=BV,
+     )
+     return dh, dh0, dv2