rwkv_ops-0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rwkv_ops/__init__.py +45 -0
  2. rwkv_ops/mhc_kernel/__init__.py +50 -0
  3. rwkv_ops/mhc_kernel/common_kernel/include/mhc_types.h +66 -0
  4. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_post_op.cuh +197 -0
  5. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_pre_op.cuh +212 -0
  6. rwkv_ops/mhc_kernel/common_kernel/kernels/rmsnorm.cuh +152 -0
  7. rwkv_ops/mhc_kernel/common_kernel/kernels/sinkhorn_knopp.cuh +158 -0
  8. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_aggregate.cuh +141 -0
  9. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_distribute.cuh +111 -0
  10. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_mix.cuh +164 -0
  11. rwkv_ops/mhc_kernel/common_kernel/kernels/type_conversions.cuh +52 -0
  12. rwkv_ops/mhc_kernel/jax_kernel/CMakeLists.txt +47 -0
  13. rwkv_ops/mhc_kernel/jax_kernel/mhu_ffi.cu +652 -0
  14. rwkv_ops/mhc_kernel/jax_kernel/mhu_jax.py +939 -0
  15. rwkv_ops/mhc_kernel/native_keras_op.py +193 -0
  16. rwkv_ops/mhc_kernel/torch_kernel/mhc_cuda.cu +207 -0
  17. rwkv_ops/mhc_kernel/torch_kernel/mhc_op.cpp +296 -0
  18. rwkv_ops/mhc_kernel/torch_kernel/mhc_torch.py +306 -0
  19. rwkv_ops/rwkv6_kernel/__init__.py +120 -0
  20. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp +44 -0
  21. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h +64 -0
  22. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h +56 -0
  23. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h +41 -0
  24. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu +512 -0
  25. rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp +44 -0
  26. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h +64 -0
  27. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h +56 -0
  28. rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h +41 -0
  29. rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip +514 -0
  30. rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +722 -0
  31. rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +90 -0
  32. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu +397 -0
  33. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp +93 -0
  34. rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +305 -0
  35. rwkv_ops/rwkv7_kernel/__init__.py +113 -0
  36. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +220 -0
  37. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  38. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt +42 -0
  39. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu +399 -0
  40. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py +311 -0
  41. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/CMakeLists.txt +42 -0
  42. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_ffi.cu +172 -0
  43. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_jax.py +190 -0
  44. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  45. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  46. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  47. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  48. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  49. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  50. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  51. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  52. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  53. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  54. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  55. rwkv_ops/rwkv7_kernel/mlx_op.py +118 -0
  56. rwkv_ops/rwkv7_kernel/native_keras_op.py +108 -0
  57. rwkv_ops/rwkv7_kernel/tf_eager_kernel.py +155 -0
  58. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu +235 -0
  59. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp +63 -0
  60. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_torch.py +233 -0
  61. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_cuda.cu +101 -0
  62. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_op.cpp +56 -0
  63. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_torch.py +112 -0
  64. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  65. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  66. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  67. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  68. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  69. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  70. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  71. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  72. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  73. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  74. rwkv_ops/rwkv7_kernel/torch_op.py +504 -0
  75. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  76. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  77. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  78. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  79. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  80. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  81. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  82. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  83. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  84. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  85. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  86. rwkv_ops-0.6.1.dist-info/METADATA +495 -0
  87. rwkv_ops-0.6.1.dist-info/RECORD +89 -0
  88. rwkv_ops-0.6.1.dist-info/WHEEL +4 -0
  89. rwkv_ops-0.6.1.dist-info/licenses/LICENSE.txt +201 -0
rwkv_ops/rwkv7_kernel/native_keras_op.py
@@ -0,0 +1,108 @@
+ import keras
+ from keras import ops
+
+
+ def transpose_head(x, head_first):
+     """
+     Transpose the input tensor.
+
+     Args:
+         x: input tensor.
+         head_first: boolean deciding whether to transpose.
+
+     Returns:
+         The transposed tensor if head_first is True, otherwise the original tensor.
+     """
+     x = ops.cast(x, "float32")
+     if head_first:
+         return ops.transpose(x, (0, 2, 1, 3))
+     else:
+         return x
+
+
+ def generalized_delta_rule(
+     r,
+     w,
+     k,
+     v,
+     a,
+     b,
+     initial_state=None,
+     output_final_state: bool = True,
+     head_first: bool = False,
+ ):
+     """
+     Implements the generalized delta rule.
+
+     Args:
+         r: input tensor.
+         w: weight tensor.
+         k, v, a, b: other input tensors.
+         initial_state: initial state tensor.
+         output_final_state: whether to also return the final state.
+         head_first: whether the head dimension comes before the time dimension.
+
+     Returns:
+         The output, plus the final state if output_final_state is True.
+     """
+     DTYPE = r.dtype
+     r = transpose_head(r, head_first)
+     k = transpose_head(k, head_first)
+     v = transpose_head(v, head_first)
+     a = transpose_head(a, head_first)
+     b = transpose_head(b, head_first)
+     w = transpose_head(w, head_first)
+     # shape is read after transpose_head, so (B, T, H, N) is correct for head_first=True as well
+     B, T, H, N = ops.shape(r)
+     w = ops.exp(-ops.exp(w))
+
+     if initial_state is not None:
+         state = initial_state
+         if ops.shape(state)[0] == 1:
+             state = ops.broadcast_to(state, (B, H, N, N))
+     else:
+         state = ops.zeros((B, H, N, N))
+     state = ops.cast(state, "float32")
+
+     keras_backend = keras.config.backend()
+
+     def step(t, inputs):
+         """
+         Run a single time step.
+
+         Args:
+             t: current time step.
+             inputs: list containing the current state and the output buffer.
+
+         Returns:
+             The updated state and output buffer.
+         """
+         state, out = inputs
+         kk = ops.reshape(k[:, t, :], (B, H, 1, N))
+         rr = ops.reshape(r[:, t, :], (B, H, N, 1))
+         vv = ops.reshape(v[:, t, :], (B, H, N, 1))
+         aa = ops.reshape(a[:, t, :], (B, H, N, 1))
+         bb = ops.reshape(b[:, t, :], (B, H, 1, N))
+         state = state * w[:, t, :, None, :] + state @ aa @ bb + vv @ kk
+         o = ops.cast((state @ rr), out.dtype)
+         if keras_backend == "tensorflow":
+             out = out.write(t, ops.reshape(o, (B, H, N)))
+         elif keras_backend == "torch":
+             out[:, t : t + 1] = ops.reshape(o, (B, 1, H, N))
+         else:
+             out = ops.slice_update(out, [0, t, 0, 0], ops.reshape(o, (B, 1, H, N)))
+         return [state, out]
+
+     if keras_backend == "tensorflow":
+         import tensorflow as tf
+
+         out = tf.TensorArray(DTYPE, size=T)
+     else:
+         out = ops.zeros((B, T, H, N), DTYPE)
+     state, out = ops.fori_loop(0, T, step, [state, out])
+     if keras_backend == "tensorflow":
+         out = ops.transpose(out.stack(), [1, 0, 2, 3])
+     if output_final_state:
+         return ops.cast(out, DTYPE), state
+     return ops.cast(out, DTYPE)
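
For reference, a minimal usage sketch of the Keras-native path defined above. The sizes and random inputs are illustrative only; `generalized_delta_rule` refers to the function in this file.

    import numpy as np
    from keras import ops

    # illustrative sizes: batch, sequence length, heads, head size
    B, T, H, N = 1, 8, 2, 64
    r, w, k, v, a, b = (np.random.randn(B, T, H, N).astype("float32") for _ in range(6))

    out, last_state = generalized_delta_rule(r, w, k, v, a, b, output_final_state=True)
    print(ops.shape(out), ops.shape(last_state))  # (1, 8, 2, 64) and (1, 2, 64, 64)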
rwkv_ops/rwkv7_kernel/tf_eager_kernel.py
@@ -0,0 +1,155 @@
+ """
+ TensorFlow version of generalized_delta_rule.
+ The forward pass calls the JAX CUDA kernel through tf.py_function; the backward pass also goes through JAX.
+ Can be compiled with @tf.function and trained with tf.GradientTape.
+ """
+
+ import tensorflow as tf
+ from typing import Optional, Tuple
+ import jax.numpy as jnp
+ from .jax_cuda_kernel.wkv7_jax import get_jax_generalized_delta_rule
+ from .jax_cuda_kernel_single.wkv7_single_step_jax import (
+     get_jax_generalized_delta_rule_single_step,
+ )
+
+
+ def transpose_head(x, head_first: bool):
+     """(B, T, H, K) <-> (B, H, T, K)"""
+     x = tf.cast(x, dtype=tf.float32)
+     if head_first:
+         return tf.transpose(x, (0, 2, 1, 3))
+     return x
+
+
+ def get_tf_generalized_delta_rule(HEAD_SIZE=64):
+     generalized_delta_rule_inference = get_jax_generalized_delta_rule(HEAD_SIZE)[1]
+
+     # ---------- low-level kernel wrapper ----------
+     @tf.py_function(Tout=[tf.bfloat16, tf.float32])
+     def _tf_wkv7_fwd(w, q, k, v, a, b, h0):
+         """tf.py_function wrapper around the JAX forward pass"""
+         y, s = generalized_delta_rule_inference(
+             w=jnp.asarray(w, jnp.bfloat16),
+             r=jnp.asarray(q, jnp.bfloat16),
+             k=jnp.asarray(k, jnp.bfloat16),
+             v=jnp.asarray(v, jnp.bfloat16),
+             a=jnp.asarray(a, jnp.bfloat16),
+             b=jnp.asarray(b, jnp.bfloat16),
+             initial_state=jnp.asarray(h0, jnp.float32),
+         )
+         return (
+             tf.convert_to_tensor(y, tf.bfloat16),
+             tf.convert_to_tensor(s, tf.float32),
+         )
+
+     # ---------- user-facing interface ----------
+     def generalized_delta_rule(
+         r: tf.Tensor,  # (B, T, H, K) or (B, H, T, K)
+         w: tf.Tensor,
+         k: tf.Tensor,
+         v: tf.Tensor,
+         a: tf.Tensor,
+         b: tf.Tensor,
+         initial_state: Optional[tf.Tensor] = None,
+         output_final_state: bool = True,
+         head_first: bool = False,
+         chunk_len: int = 16,
+     ) -> Tuple[tf.Tensor, tf.Tensor]:
+         """
+         Mirrors the JAX interface 1:1 and returns (out, last_state).
+         Can be compiled with @tf.function and trained with tf.GradientTape.
+         """
+         dtype = r.dtype
+
+         r = transpose_head(r, head_first)
+         w = transpose_head(w, head_first)
+         k = transpose_head(k, head_first)
+         v = transpose_head(v, head_first)
+         a = transpose_head(a, head_first)
+         b = transpose_head(b, head_first)
+
+         B, T, H, K = tf.unstack(tf.shape(r), num=4)
+         if T % chunk_len != 0:
+             raise ValueError(f"T={T} must be divisible by chunk_len={chunk_len}")
+
+         if initial_state is None:
+             h0 = tf.zeros([B, H, K, K], dtype=tf.float32)
+         else:
+             h0 = tf.cast(initial_state, tf.float32)
+
+         # forward pass (with gradients)
+         out, last_state = _tf_wkv7_fwd(w, r, k, v, a, b, h0)
+
+         # cast back to the dtype the caller expects
+         out = tf.cast(out, dtype)
+
+         return (out, last_state) if output_final_state else out
+
+     return generalized_delta_rule
+
+
+ def get_tf_generalized_delta_rule_single_step(HEAD_SIZE=64):
+     # fetch the single-step generalized delta rule from the JAX implementation
+     _wkv7_single_step_kernel = get_jax_generalized_delta_rule_single_step(HEAD_SIZE)
+
+     # ---------- low-level kernel wrapper ----------
+     @tf.py_function(Tout=[tf.bfloat16, tf.float32])
+     def _tf_wkv7_single_step_fwd(w, r, k, v, a, b, h0):
+         """tf.py_function wrapper around the JAX single-step forward pass"""
+         y, s = _wkv7_single_step_kernel(
+             w=jnp.asarray(w, jnp.bfloat16),
+             r=jnp.asarray(r, jnp.bfloat16),
+             k=jnp.asarray(k, jnp.bfloat16),
+             v=jnp.asarray(v, jnp.bfloat16),
+             a=jnp.asarray(a, jnp.bfloat16),
+             b=jnp.asarray(b, jnp.bfloat16),
+             initial_state=jnp.asarray(h0, jnp.float32),
+         )
+         return (
+             tf.convert_to_tensor(y, tf.bfloat16),
+             tf.convert_to_tensor(s, tf.float32),
+         )
+
+     # ---------- user-facing interface ----------
+     def generalized_delta_rule_single_step(
+         r: tf.Tensor,  # (B, 1, H, K) or (B, H, 1, K)
+         w: tf.Tensor,
+         k: tf.Tensor,
+         v: tf.Tensor,
+         a: tf.Tensor,
+         b: tf.Tensor,
+         initial_state: Optional[tf.Tensor] = None,
+         output_final_state: bool = True,
+         head_first: bool = False,
+     ) -> Tuple[tf.Tensor, tf.Tensor]:
+         """
+         Single-step generalized delta rule.
+         Mirrors the JAX single-step interface and returns (out, last_state).
+         """
+         dtype = r.dtype
+
+         r = transpose_head(r, head_first)
+         w = transpose_head(w, head_first)
+         k = transpose_head(k, head_first)
+         v = transpose_head(v, head_first)
+         a = transpose_head(a, head_first)
+         b = transpose_head(b, head_first)
+
+         B, T, H, K = tf.unstack(tf.shape(r), num=4)
+         if T != 1:
+             raise ValueError(f"Single-step kernel requires T=1, but got T={T}")
+
+         if initial_state is None:
+             h0 = tf.zeros([B, H, K, K], dtype=tf.float32)
+         else:
+             h0 = tf.cast(initial_state, tf.float32)
+
+         # forward pass
+         y, s = _tf_wkv7_single_step_fwd(w, r, k, v, a, b, h0)
+
+         # cast back to the dtype the caller expects
+         out = tf.cast(y, dtype)
+
+         return (out, s) if output_final_state else out
+
+     return generalized_delta_rule_single_step
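
A sketch of how the factory above could be called. It assumes a GPU-backed JAX install is available, the shapes are illustrative, and T must be divisible by chunk_len.

    import tensorflow as tf

    wkv7 = get_tf_generalized_delta_rule(HEAD_SIZE=64)

    B, T, H, K = 1, 32, 2, 64  # illustrative; K must equal HEAD_SIZE
    r, w, k, v, a, b = (tf.random.normal((B, T, H, K)) for _ in range(6))

    out, last_state = wkv7(r, w, k, v, a, b, chunk_len=16)
    # expected: out has shape (B, T, H, K) cast back to r.dtype, last_state is (B, H, K, K) float32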
rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu
@@ -0,0 +1,235 @@
+ #include <cuda_bf16.h>
+ #include <assert.h>
+ #include <cstdint>
+ // ref link: https://github.com/BlinkDL/RWKV-CUDA/tree/main/rwkv7_fast_fused
+ using bf = __nv_bfloat16;
+
+ __device__ inline float to_float(const bf & u) {
+     return __bfloat162float(u);
+ }
+
+ __device__ inline bf to_bf(const float & u) {
+     return __float2bfloat16_rn(u);
+ }
+ typedef bf * __restrict__ F_;
+
+ /* -------------------- Forward kernel -------------------- */
+ template<int C> __launch_bounds__(C, 2) // [Optimization 1] explicit launch bounds to improve occupancy
+ __global__ void forward_kernel(int T, int H,
+         F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_,
+         bf* y_, float* s_, float* sa_, float* h0_) {
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float state[C] = {0};
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     int64_t h0_base = ((int64_t)bb*H + hh)*C*C + i*C;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         state[j] = h0_[h0_base + j];
+     }
+
+     for (int t = 0; t < T; t++) {
+         int64_t ind = (int64_t)bb*T*H*C + (int64_t)t*H*C + hh * C + i;
+
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             sa += a[j] * state[j];
+         }
+         sa_[ind] = sa;
+
+         float v_val = to_float(v_[ind]);
+         float y = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             float &s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v_val;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int64_t base = ((int64_t)bb*H+hh)*(T/_CHUNK_LEN_)*C*C + ((int64_t)t/_CHUNK_LEN_)*C*C + i;
+             #pragma unroll
+             for (int j = 0; j < C; j++) {
+                 s_[base + j*C] = state[j];
+             }
+         }
+     }
+ }
+
+ /* -------------------- Backward kernel -------------------- */
+ template<int C> __launch_bounds__(C, 2) // [Optimization 1] explicit launch bounds
+ __global__ void backward_kernel(int T, int H,
+         F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, F_ dy_,
+         float * __restrict__ s_, float * __restrict__ sa_,
+         float * __restrict__ dht_, float * __restrict__ dh0_,
+         bf* dw_, bf* dq_, bf* dk_, bf* dv_, bf* da_, bf* db_) {
+
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float stateT[C] = {0}, dstate[C] = {0}, dstateT[C] = {0};
+
+     int64_t dht_base = ((int64_t)bb*H + hh)*C*C + i*C;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         dstate[j] = dht_[dht_base + j];
+         dstateT[j] = dht_[dht_base + j];
+     }
+
+     __shared__ float w[C], q[C], k[C], v[C], a[C], b[C], dy[C], sa[C], dSb_shared[C];
+     float qi, wi, ki, ai, bi, dyi;
+
+     for (int t = T-1; t >= 0; t--) {
+         int64_t ind = (int64_t)bb*T*H*C + (int64_t)t*H*C + hh * C + i;
+
+         __syncthreads();
+         q[i] = qi = to_float(q_[ind]);
+         float wi_fac = -__expf(to_float(w_[ind]));
+         w[i] = wi = __expf(wi_fac);
+         k[i] = ki = to_float(k_[ind]);
+         v[i] = to_float(v_[ind]);
+         a[i] = ai = to_float(a_[ind]);
+         b[i] = bi = to_float(b_[ind]);
+         dy[i] = dyi = to_float(dy_[ind]);
+         sa[i] = sa_[ind];
+         __syncthreads();
+
+         if ((t+1)%_CHUNK_LEN_ == 0) {
+             int64_t base = ((int64_t)bb*H+hh)*(T/_CHUNK_LEN_)*C*C + ((int64_t)t/_CHUNK_LEN_)*C*C + i*C;
+
+             // [Optimization 2] float4 vectorized loads: 4 floats per memory transaction
+             const float4* s4 = (const float4*)(s_ + base);
+             #pragma unroll
+             for (int j4 = 0; j4 < C/4; j4++) {
+                 float4 q_vec = s4[j4];
+                 const int j = j4 * 4;
+                 stateT[j+0] = q_vec.x;
+                 stateT[j+1] = q_vec.y;
+                 stateT[j+2] = q_vec.z;
+                 stateT[j+3] = q_vec.w;
+             }
+         }
+
+         float dq_val = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dq_val += stateT[j] * dy[j];
+         }
+         dq_[ind] = to_bf(dq_val);
+
+         float iwi = 1.0f/(wi + 0.000001f);
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             stateT[j] = (stateT[j] - ki*v[j] - bi*sa[j]) * iwi;
+             dstate[j] += dyi * q[j];
+             dstateT[j] += qi * dy[j];
+         }
+
+         float dw = 0, dk = 0, dv = 0, db = 0, dSb = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dw += dstateT[j] * stateT[j];
+             dk += dstateT[j] * v[j];
+             dv += dstate[j] * k[j];
+             dSb += dstate[j] * b[j];
+             db += dstateT[j] * sa[j];
+         }
+         dw_[ind] = to_bf(dw * wi * wi_fac);
+         dk_[ind] = to_bf(dk);
+         dv_[ind] = to_bf(dv);
+         db_[ind] = to_bf(db);
+
+         __syncthreads();
+         dSb_shared[i] = dSb;
+         __syncthreads();
+
+         float da = 0;
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             da += stateT[j] * dSb_shared[j];
+         }
+         da_[ind] = to_bf(da);
+
+         #pragma unroll
+         for (int j = 0; j < C; j++) {
+             dstate[j] = dstate[j] * w[j] + dSb * a[j];
+             dstateT[j] = dstateT[j] * wi + ai * dSb_shared[j];
+             if (t == 0) {
+                 dh0_[dht_base + j] = dstate[j];
+             }
+         }
+     }
+ }
+
+ /* -------------------- Inference-only kernel -------------------- */
+ template<int C> __launch_bounds__(C, 2) // [Optimization 1] same launch-bounds optimization for the inference kernel
+ __global__ void forward_inference_kernel(int T, int H,
+         F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_,
+         bf *y_, float *s_, float *h0_) {
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float state[C] = {0};
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     int64_t h0_base = ((int64_t)bb * H + hh) * C * C + i * C;
+     #pragma unroll
+     for (int j = 0; j < C; ++j) state[j] = h0_[h0_base + j];
+
+     for (int t = 0; t < T; ++t) {
+         int64_t ind = (int64_t)bb * T * H * C + (int64_t)t * H * C + hh * C + i;
+
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) sa += a[j] * state[j];
+
+         float v_val = to_float(v_[ind]);
+         float y = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             float &s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v_val;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+     }
+
+     // write only the final state
+     int64_t base = ((int64_t)bb * H + hh) * C * C + i * C;
+     #pragma unroll
+     for (int j = 0; j < C; ++j) s_[base + j] = state[j];
+ }
+
+ /* -------------------- C interface functions -------------------- */
+ void cuda_forward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa, float* h0) {
+     forward_kernel<_C_><<<dim3(H,B), dim3(_C_)>>>(T,H,w,q,k,v,z,a,y,s,sa,h0);
+ }
+
+ void cuda_backward(int B, int T, int H,
+         bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy,
+         float*s, float*sa, float*dht, float*dh0,
+         bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da
+         ) {
+     assert(T%_CHUNK_LEN_ == 0);
+     backward_kernel<_C_><<<dim3(H,B), dim3(_C_)>>>(T,H,w,q,k,v,z,a,dy,s,sa,dht,dh0,dw,dq,dk,dv,dz,da);
+ }
+
+ void cuda_forward_inference(int B, int T, int H,
+         bf* w, bf* q, bf* k, bf* v, bf* a, bf* b,
+         bf* y, float* s, float* h0) {
+     forward_inference_kernel<_C_><<<dim3(H, B), dim3(_C_)>>>(T, H, w, q, k, v, a, b, y, s, h0);
+ }
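
Restated compactly, for each (batch, head) pair the forward and inference kernels above advance a C×C state S_t per token, with thread i owning row i of S; this is the same recurrence the Keras reference above implements:

    S_t = S_{t-1}\,\operatorname{diag}(w_t) + (S_{t-1} a_t)\, b_t^{\top} + v_t k_t^{\top},
    \qquad y_t = S_t\, r_t,
    \qquad w_t = \exp(-\exp(\hat{w}_t)).

The sa_ buffer caches S_{t-1} a_t for reuse in the backward pass, and s_ stores a checkpoint of S every _CHUNK_LEN_ steps; backward_kernel reloads the checkpoint at each chunk boundary and steps the state backwards inside the chunk by inverting the update.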
rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp
@@ -0,0 +1,63 @@
+ #include <torch/extension.h>
+ #include <cuda_bf16.h>
+
+ using bf = __nv_bfloat16;
+
+ // ---------- existing function declarations ----------
+ void cuda_forward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*y, float*s, float*sa, float* h0);
+ void cuda_backward(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*z, bf*a, bf*dy, float*s, float*sa, float*dht, float*dh0, bf*dw, bf*dq, bf*dk, bf*dv, bf*dz, bf*da);
+
+ // ---------- new inference function declaration (required!) ----------
+ void cuda_forward_inference(int B, int T, int H, bf*w, bf*q, bf*k, bf*v, bf*a, bf*b, bf*y, float*s, float* h0);
+
+ // ---------- existing forward function ----------
+ void forward(torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v,
+              torch::Tensor &z, torch::Tensor &a,
+              torch::Tensor &y,
+              torch::Tensor &s, torch::Tensor &sa, torch::Tensor &h0) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_forward(B, T, H,
+                  (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(), (bf*)y.data_ptr(),
+                  (float*)s.data_ptr(), (float*)sa.data_ptr(), (float*)h0.data_ptr());
+ }
+
+ // ---------- existing backward function ----------
+ void backward(torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v, torch::Tensor &z, torch::Tensor &a, torch::Tensor &dy,
+               torch::Tensor &s, torch::Tensor &sa, torch::Tensor &dht, torch::Tensor &dh0,
+               torch::Tensor &dw, torch::Tensor &dq, torch::Tensor &dk, torch::Tensor &dv, torch::Tensor &dz, torch::Tensor &da) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_backward(B, T, H, (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(), (bf*)z.data_ptr(), (bf*)a.data_ptr(),
+                   (bf*)dy.data_ptr(),
+                   (float*)s.data_ptr(), (float*)sa.data_ptr(), (float*)dht.data_ptr(), (float*)dh0.data_ptr(),
+                   (bf*)dw.data_ptr(), (bf*)dq.data_ptr(), (bf*)dk.data_ptr(), (bf*)dv.data_ptr(), (bf*)dz.data_ptr(), (bf*)da.data_ptr());
+ }
+
+ // ---------- new inference forward function ----------
+ void forward_inference(torch::Tensor &w, torch::Tensor &q, torch::Tensor &k, torch::Tensor &v,
+                        torch::Tensor &a, torch::Tensor &b,
+                        torch::Tensor &y,
+                        torch::Tensor &s, torch::Tensor &h0) {
+     int B = w.sizes()[0], T = w.sizes()[1], H = w.sizes()[2];
+     cuda_forward_inference(B, T, H,
+                            (bf*)w.data_ptr(), (bf*)q.data_ptr(), (bf*)k.data_ptr(), (bf*)v.data_ptr(),
+                            (bf*)a.data_ptr(), (bf*)b.data_ptr(),
+                            (bf*)y.data_ptr(),
+                            (float*)s.data_ptr(), (float*)h0.data_ptr());
+ }
+
+ // ---------- combined operator registration (do not split into separate blocks!) ----------
+ TORCH_LIBRARY(wind_backstepping, m) {
+     // training ops
+     m.def("forward(Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor(a!) y, Tensor(b!) s, Tensor(c!) sa, Tensor(f!) h0) -> ()");
+     m.def("backward(Tensor w, Tensor q, Tensor k, Tensor v, Tensor z, Tensor a, Tensor dy, Tensor s, Tensor sa, Tensor dht, Tensor(a!) dh0, Tensor(b!) dw, Tensor(c!) dq, Tensor(d!) dk, Tensor(e!) dv, Tensor(f!) dz, Tensor(g!) da) -> ()");
+
+     // inference op (appended to the same block)
+     m.def("forward_inference(Tensor w, Tensor q, Tensor k, Tensor v, Tensor a, Tensor b, Tensor(a!) y, Tensor(b!) s, Tensor(c!) h0) -> ()");
+ }
+
+ // ---------- combined implementation registration ----------
+ TORCH_LIBRARY_IMPL(wind_backstepping, CUDA, m) {
+     m.impl("forward", &forward);
+     m.impl("backward", &backward);
+     m.impl("forward_inference", &forward_inference);
+ }
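
The package's wkv7_torch.py (listed above but not shown in this diff) builds and wraps these ops. Purely as an illustration of the registered schema, a direct call to the inference op could look like the sketch below; the load() flags, the -D_C_ / -D_CHUNK_LEN_ defines, and the tensor shapes are assumptions inferred from wkv7_cuda.cu, not the package's actual build code.

    import torch
    from torch.utils.cpp_extension import load

    HEAD_SIZE, CHUNK_LEN = 64, 16  # assumed values; the real ones come from the Python wrapper
    load(
        name="wind_backstepping",
        sources=["wkv7_op.cpp", "wkv7_cuda.cu"],
        extra_cuda_cflags=[f"-D_C_={HEAD_SIZE}", f"-D_CHUNK_LEN_={CHUNK_LEN}"],
        is_python_module=False,  # ops are exposed via TORCH_LIBRARY, not pybind11
    )

    B, T, H, C = 1, 32, 2, HEAD_SIZE
    dev = "cuda"
    w, q, k, v, a, b = (torch.randn(B, T, H, C, dtype=torch.bfloat16, device=dev) for _ in range(6))
    y = torch.empty_like(q)                                        # output, written in place
    s = torch.empty(B, H, C, C, dtype=torch.float32, device=dev)   # final state
    h0 = torch.zeros(B, H, C, C, dtype=torch.float32, device=dev)  # initial state
    torch.ops.wind_backstepping.forward_inference(w, q, k, v, a, b, y, s, h0)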