rwkv_ops-0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rwkv_ops/__init__.py +45 -0
  2. rwkv_ops/mhc_kernel/__init__.py +50 -0
  3. rwkv_ops/mhc_kernel/common_kernel/include/mhc_types.h +66 -0
  4. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_post_op.cuh +197 -0
  5. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_pre_op.cuh +212 -0
  6. rwkv_ops/mhc_kernel/common_kernel/kernels/rmsnorm.cuh +152 -0
  7. rwkv_ops/mhc_kernel/common_kernel/kernels/sinkhorn_knopp.cuh +158 -0
  8. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_aggregate.cuh +141 -0
  9. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_distribute.cuh +111 -0
  10. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_mix.cuh +164 -0
  11. rwkv_ops/mhc_kernel/common_kernel/kernels/type_conversions.cuh +52 -0
  12. rwkv_ops/mhc_kernel/jax_kernel/CMakeLists.txt +47 -0
  13. rwkv_ops/mhc_kernel/jax_kernel/mhu_ffi.cu +652 -0
  14. rwkv_ops/mhc_kernel/jax_kernel/mhu_jax.py +939 -0
  15. rwkv_ops/mhc_kernel/native_keras_op.py +193 -0
  16. rwkv_ops/mhc_kernel/torch_kernel/mhc_cuda.cu +207 -0
  17. rwkv_ops/mhc_kernel/torch_kernel/mhc_op.cpp +296 -0
  18. rwkv_ops/mhc_kernel/torch_kernel/mhc_torch.py +306 -0
  19. rwkv_ops/rwkv6_kernel/__init__.py +120 -0
  20. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp +44 -0
  21. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h +64 -0
  22. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h +56 -0
  23. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h +41 -0
  24. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu +512 -0
  25. rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp +44 -0
  26. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h +64 -0
  27. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h +56 -0
  28. rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h +41 -0
  29. rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip +514 -0
  30. rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +722 -0
  31. rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +90 -0
  32. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu +397 -0
  33. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp +93 -0
  34. rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +305 -0
  35. rwkv_ops/rwkv7_kernel/__init__.py +113 -0
  36. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +220 -0
  37. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  38. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt +42 -0
  39. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu +399 -0
  40. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py +311 -0
  41. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/CMakeLists.txt +42 -0
  42. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_ffi.cu +172 -0
  43. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_jax.py +190 -0
  44. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  45. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  46. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  47. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  48. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  49. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  50. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  51. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  52. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  53. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  54. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  55. rwkv_ops/rwkv7_kernel/mlx_op.py +118 -0
  56. rwkv_ops/rwkv7_kernel/native_keras_op.py +108 -0
  57. rwkv_ops/rwkv7_kernel/tf_eager_kernel.py +155 -0
  58. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu +235 -0
  59. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp +63 -0
  60. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_torch.py +233 -0
  61. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_cuda.cu +101 -0
  62. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_op.cpp +56 -0
  63. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_torch.py +112 -0
  64. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  65. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  66. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  67. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  68. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  69. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  70. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  71. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  72. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  73. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  74. rwkv_ops/rwkv7_kernel/torch_op.py +504 -0
  75. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  76. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  77. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  78. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  79. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  80. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  81. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  82. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  83. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  84. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  85. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  86. rwkv_ops-0.6.1.dist-info/METADATA +495 -0
  87. rwkv_ops-0.6.1.dist-info/RECORD +89 -0
  88. rwkv_ops-0.6.1.dist-info/WHEEL +4 -0
  89. rwkv_ops-0.6.1.dist-info/licenses/LICENSE.txt +201 -0
rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_torch.py
@@ -0,0 +1,233 @@
+ import os
+ import torch
+ from torch.utils.cpp_extension import load
+ from keras.src.backend.torch.core import cast
+ from keras.src.backend.torch.numpy import transpose, zeros
+
+
+ def transpose_head(x, head_first):
+     if head_first:
+         return transpose(x, (0, 2, 1, 3))
+     else:
+         return x
+
+
+ def get_torch_generalized_delta_rule(HEAD_SIZE=64):
+     CHUNK_LEN = 16
+     flags = [
+         "-res-usage",
+         f"-D_C_={HEAD_SIZE}",
+         f"-D_CHUNK_LEN_={CHUNK_LEN}",
+         "--use_fast_math",
+         "-O3",
+         "-Xptxas -O3",
+         "--extra-device-vectorization",
+     ]
+     # Absolute path of the current file
+     current_file_path = os.path.abspath(__file__)
+
+     # Directory containing the current file
+     current_dir_path = os.path.dirname(current_file_path)
+     load(
+         name="wind_backstepping",
+         sources=[
+             os.path.join(current_dir_path, "wkv7_cuda.cu"),
+             os.path.join(current_dir_path, "wkv7_op.cpp"),
+         ],
+         is_python_module=False,
+         verbose=True,
+         extra_cuda_cflags=flags,
+     )
+
+     class WindBackstepping(torch.autograd.Function):
+         @staticmethod
+         def forward(ctx, w, q, k, v, z, b, h0):
+             B, T, H, N = w.shape
+             DTYPE = q.dtype
+             q = cast(q, "bfloat16")
+             k = cast(k, "bfloat16")
+             v = cast(v, "bfloat16")
+             z = cast(z, "bfloat16")
+             b = cast(b, "bfloat16")
+             w = cast(w, "bfloat16")
+             if T % CHUNK_LEN != 0:
+                 raise ValueError(
+                     "The RWKV input sequence length must be divisible by 16. "
+                     "Please make sure the sequence length is a multiple of CHUNK_LEN (16)."
+                 )
+             assert all(i.is_contiguous() for i in [w, q, k, v, z, b])
+             y = torch.empty_like(v)
+             s = torch.empty(
+                 B, H, T // CHUNK_LEN, N, N, dtype=torch.float32, device=w.device
+             )
+             sa = torch.empty(B, T, H, N, dtype=torch.float32, device=w.device)
+             torch.ops.wind_backstepping.forward(w, q, k, v, z, b, y, s, sa, h0)
+             ctx.save_for_backward(w, q, k, v, z, b, s, sa)
+             last_state = torch.empty_like(h0)
+             last_state.copy_(transpose(s[:, :, -1], [0, 1, 3, 2]))
+
+             return cast(y, DTYPE), last_state
+
+         @staticmethod
+         def backward(ctx, dy, dht):
+             DTYPE = dy.dtype
+             dy = cast(dy, "bfloat16")
+             dy = dy.contiguous()
+
+             w, q, k, v, z, b, s, sa = ctx.saved_tensors
+             dht = cast(dht, "float32")
+             dht = dht.contiguous()
+             assert all(i.dtype == torch.bfloat16 for i in [dy])
+             assert all(i.is_contiguous() for i in [dy, dht])
+             dh0 = torch.empty(dht.shape, dtype=dht.dtype, device=dht.device)
+             dw, dq, dk, dv, dz, db = [torch.empty_like(x) for x in [w, q, k, v, z, b]]
+
+             torch.ops.wind_backstepping.backward(
+                 w, q, k, v, z, b, dy, s, sa, dht, dh0, dw, dq, dk, dv, dz, db
+             )
+             return (
+                 cast(dw, DTYPE),
+                 cast(dq, DTYPE),
+                 cast(dk, DTYPE),
+                 cast(dv, DTYPE),
+                 cast(dz, DTYPE),
+                 cast(db, DTYPE),
+                 dh0,
+             )
+
+     def RUN_CUDA_RWKV7g(q, w, k, v, a, b, h0):
+         B, T, H, C = q.shape
+         q = q.contiguous()
+         w = w.contiguous()
+         k = k.contiguous()
+         v = v.contiguous()
+         a = a.contiguous()
+         b = b.contiguous()
+         out, state = WindBackstepping.apply(w, q, k, v, a, b, h0)
+         return out, state
+
+     def generalized_delta_rule(
+         r: torch.Tensor,
+         w: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         a: torch.Tensor,
+         b: torch.Tensor,
+         initial_state: torch.Tensor = None,
+         output_final_state: bool = True,
+         head_first: bool = False,
+         use_chunk: bool = True,
+     ):
+         if w.device.type != "cuda":
+             from ..native_keras_op import generalized_delta_rule
+
+             return generalized_delta_rule(
+                 r=r,
+                 k=k,
+                 v=v,
+                 a=a,
+                 b=b,
+                 w=w,
+                 initial_state=initial_state,
+                 output_final_state=output_final_state,
+             )
+         r = transpose_head(r, head_first)
+         k = transpose_head(k, head_first)
+         v = transpose_head(v, head_first)
+         a = transpose_head(a, head_first)
+         b = transpose_head(b, head_first)
+         w = transpose_head(w, head_first)
+         B, T, H, N = w.shape
+         if initial_state is None:
+             initial_state = zeros((B, H, N, N), "float32")
+         else:
+             initial_state = cast(initial_state, "float32")
+         out, state = RUN_CUDA_RWKV7g(r, w, k, v, a, b, initial_state)
+         if output_final_state:
+             return out, state
+         return out
+
+     class Wkv7Inference(torch.autograd.Function):
+         @staticmethod
+         def forward(ctx, w, q, k, v, a, b, h0):
+             B, T, H, N = w.shape
+             DTYPE = q.dtype
+
+             # Cast inputs to bfloat16
+             q = cast(q, "bfloat16")
+             k = cast(k, "bfloat16")
+             v = cast(v, "bfloat16")
+             a = cast(a, "bfloat16")
+             b = cast(b, "bfloat16")
+             w = cast(w, "bfloat16")
+
+             assert all(i.is_contiguous() for i in [w, q, k, v, a, b])
+
+             # Key point: s has shape (B, H, N, N) instead of (B, H, chunk_num, N, N)
+             y = torch.empty_like(v)
+             s = torch.empty(B, H, N, N, dtype=torch.float32, device=w.device)
+
+             # Call the inference op (no sa buffer needed)
+             torch.ops.wind_backstepping.forward_inference(w, q, k, v, a, b, y, s, h0)
+
+             return cast(y, DTYPE), s
+
+         @staticmethod
+         def backward(ctx, dy, dht):
+             raise NotImplementedError("Inference kernel does not support backward")
+
+     def RUN_CUDA_RWKV7g_inference(q, w, k, v, a, b, h0):
+         B, T, H, C = q.shape
+         q = q.contiguous()
+         w = w.contiguous()
+         k = k.contiguous()
+         v = v.contiguous()
+         a = a.contiguous()
+         b = b.contiguous()
+         out, state = Wkv7Inference.apply(w, q, k, v, a, b, h0)
+         return out, state
+
+     # -------------------- Public inference API --------------------
+     def generalized_delta_rule_inference(
+         r: torch.Tensor,
+         w: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         a: torch.Tensor,
+         b: torch.Tensor,
+         initial_state: torch.Tensor = None,
+         head_first: bool = False,
+         output_final_state: bool = True,
+     ):
+         """
+         Inference-only version; cuts peak GPU memory use by 90%+.
+
+         Args:
+             r, w, k, v, a, b: input tensors of shape (B, T, H, K) or (B, H, T, K)
+             initial_state: (B, H, K, K) initial state; zero-initialized when None
+             head_first: whether the head dimension precedes the time dimension
+         Returns:
+             out: (B, T, H, K) output
+             final_state: (B, H, K, K) final state only
+         """
+         if w.device.type != "cuda":
+             raise NotImplementedError("Inference kernel only supports CUDA")
+
+         r = transpose_head(r, head_first)
+         k = transpose_head(k, head_first)
+         v = transpose_head(v, head_first)
+         a = transpose_head(a, head_first)
+         b = transpose_head(b, head_first)
+         w = transpose_head(w, head_first)
+
+         B, T, H, N = w.shape
+         if initial_state is None:
+             initial_state = zeros((B, H, N, N), "float32")
+         else:
+             initial_state = cast(initial_state, "float32")
+
+         out, final_state = RUN_CUDA_RWKV7g_inference(r, w, k, v, a, b, initial_state)
+         return (out, final_state) if output_final_state else out
+
+     # Return both functions; callers pick the one they need
+     return [generalized_delta_rule, generalized_delta_rule_inference]
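Illustrative usage (editorial note, not part of the package contents): `get_torch_generalized_delta_rule` builds the CUDA extension once and returns the training kernel and the inference-only kernel as a pair. A minimal sketch, assuming a CUDA device, Keras on the torch backend, and a direct import from the module above (the package's `rwkv_ops/__init__.py` may expose a friendlier entry point); the random tensors are placeholders for real projections:

    import torch
    from rwkv_ops.rwkv7_kernel.torch_cuda_kernel.wkv7_torch import (
        get_torch_generalized_delta_rule,
    )

    # Build the extension for HEAD_SIZE=64 and get both kernels.
    generalized_delta_rule, generalized_delta_rule_inference = (
        get_torch_generalized_delta_rule(HEAD_SIZE=64)
    )

    B, T, H, K = 1, 32, 8, 64  # T must be a multiple of CHUNK_LEN = 16
    r, w, k, v, a, b = (
        torch.randn(B, T, H, K, device="cuda", dtype=torch.bfloat16) for _ in range(6)
    )

    # Training path: output plus the final state (B, H, K, K).
    out, state = generalized_delta_rule(r, w, k, v, a, b, output_final_state=True)

    # Inference-only path: same call signature, far lower memory use, no backward.
    out_inf, state_inf = generalized_delta_rule_inference(r, w, k, v, a, b)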
rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_cuda.cu
@@ -0,0 +1,101 @@
+ #include <cuda_bf16.h>
+ #include <assert.h>
+ #include <cstdint>
+
+ using bf = __nv_bfloat16;
+
+ __device__ inline float to_float(const bf &u) {
+     return __bfloat162float(u);
+ }
+
+ __device__ inline bf to_bf(const float &u) {
+     return __float2bfloat16_rn(u);
+ }
+
+ typedef bf *__restrict__ F_;
+
+ // Single-step forward kernel for T=1
+ template<int C>
+ __launch_bounds__(C, 2)
+ __global__ void forward_single_step_kernel(
+     int H,              // Number of heads
+     F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_,
+     float *h0_,         // (B, H, C, C) - input state
+     bf *y_,             // (B, H, C) - output
+     float *h1_          // (B, H, C, C) - output state
+ ) {
+
+     int bb = blockIdx.y;  // Batch index
+     int hh = blockIdx.x;  // Head index
+     int i = threadIdx.x;  // Row index (0..C-1)
+
+     // Load parameters for this (bb, hh, i)
+     // Shape: (B, H, C)
+     int64_t param_idx = (int64_t)bb * H * C + hh * C + i;
+
+     float w_val = to_float(w_[param_idx]);
+     w_val = __expf(-__expf(w_val));  // Decay factor
+     float q_val = to_float(q_[param_idx]);
+     float k_val = to_float(k_[param_idx]);
+     float v_val = to_float(v_[param_idx]);  // Load per-thread v
+     float a_val = to_float(a_[param_idx]);
+     float b_val = to_float(b_[param_idx]);
+
+     // Load state row i from h0_: (B, H, C, C)
+     int64_t h0_base = (int64_t)bb * H * C * C + hh * C * C + i * C;
+     float state_row[C];
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         state_row[j] = h0_[h0_base + j];
+     }
+
+     // Share vectors across threads in block (each thread loads one element)
+     __shared__ float shared_a[C], shared_b[C], shared_w[C], shared_k[C], shared_q[C];
+
+     shared_a[i] = a_val;
+     shared_b[i] = b_val;
+     shared_w[i] = w_val;
+     shared_k[i] = k_val;
+     shared_q[i] = q_val;
+     __syncthreads();
+
+     // Compute sa = sum_j(a[j] * state[i][j])
+     float sa = 0.0f;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         sa += shared_a[j] * state_row[j];
+     }
+
+     // Update state row i and compute output element i
+     float y = 0.0f;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         state_row[j] = state_row[j] * shared_w[j] + sa * shared_b[j] + shared_k[j] * v_val;
+         y += state_row[j] * shared_q[j];
+     }
+
+     // Write output y[i]: (B, H, C)
+     int64_t y_idx = (int64_t)bb * H * C + hh * C + i;
+     y_[y_idx] = to_bf(y);
+
+     // Write new state row i to h1_: (B, H, C, C)
+     int64_t h1_base = (int64_t)bb * H * C * C + hh * C * C + i * C;
+     #pragma unroll
+     for (int j = 0; j < C; j++) {
+         h1_[h1_base + j] = state_row[j];
+     }
+ }
+
+
+ void cuda_forward_single_step(
+     int B, int H,
+     bf *w, bf *q, bf *k, bf *v, bf *a, bf *b,
+     float *h0, bf *y, float *h1
+ ) {
+     dim3 blocks(H, B);  // (num_heads, batch_size)
+     dim3 threads(_C_);  // HEAD_SIZE
+
+     forward_single_step_kernel<_C_><<<blocks, threads>>>(
+         H, w, q, k, v, a, b, h0, y, h1
+     );
+ }
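For reference, each thread of `forward_single_step_kernel` owns one state row i and applies the update shown in the loops above. The same per-head recurrence can be written in a few lines of NumPy; this is an editorial sketch for readability, not code shipped in the package:

    import numpy as np

    def single_step_reference(S, w, q, k, v, a, b):
        # S: (C, C) state; rows follow the v/output index, columns the k index.
        # w, q, k, v, a, b: (C,) per-head vectors; w is the raw decay parameter.
        decay = np.exp(-np.exp(w))  # same transform as w_val in the kernel
        sa = S @ a                  # sa_i = sum_j a_j * S[i, j]
        S = S * decay[None, :] + np.outer(sa, b) + np.outer(v, k)
        y = S @ q                   # y_i = sum_j S[i, j] * q_j
        return y, S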
rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_op.cpp
@@ -0,0 +1,56 @@
+ #include <torch/extension.h>
+ #include <cuda_bf16.h>
+
+ using bf = __nv_bfloat16;
+
+ /* Forward declaration: must match the CUDA side */
+ void cuda_forward_single_step(
+     int B, int H,
+     bf* w, bf* q, bf* k, bf* v, bf* a, bf* b,
+     float* h0, bf* y, float* h1);
+
+ /* PyTorch entry point: only unpacks tensors and converts types */
+ void forward_single_step(
+     torch::Tensor w,   // (B, H, K) bfloat16
+     torch::Tensor q,   // (B, H, K) bfloat16
+     torch::Tensor k,   // (B, H, K) bfloat16
+     torch::Tensor v,   // (B, H, K) bfloat16
+     torch::Tensor a,   // (B, H, K) bfloat16
+     torch::Tensor b,   // (B, H, K) bfloat16
+     torch::Tensor h0,  // (B, H, K, K) float32
+     torch::Tensor y,   // (B, H, K) bfloat16, output
+     torch::Tensor h1)  // (B, H, K, K) float32, output
+ {
+     /* Basic validation */
+     TORCH_CHECK(w.device().is_cuda(), "All tensors must be CUDA");
+     TORCH_CHECK(w.dtype() == torch::kBFloat16, "w/q/k/v/a/b must be bfloat16");
+     TORCH_CHECK(h0.dtype() == torch::kFloat32, "h0/h1 must be float32");
+     TORCH_CHECK(w.is_contiguous(), "All tensors must be contiguous");
+
+     const int B = w.size(0);
+     const int H = w.size(1);
+     const int K = w.size(2);
+
+     cuda_forward_single_step(
+         B, H,
+         reinterpret_cast<bf*>(w.data_ptr()),
+         reinterpret_cast<bf*>(q.data_ptr()),
+         reinterpret_cast<bf*>(k.data_ptr()),
+         reinterpret_cast<bf*>(v.data_ptr()),
+         reinterpret_cast<bf*>(a.data_ptr()),
+         reinterpret_cast<bf*>(b.data_ptr()),
+         h0.data_ptr<float>(),
+         reinterpret_cast<bf*>(y.data_ptr()),
+         h1.data_ptr<float>());
+ }
+
+ /* Register the operator */
+ TORCH_LIBRARY(wind_backstepping_single_step, m) {
+     m.def("forward_single_step("
+           "Tensor w, Tensor q, Tensor k, Tensor v, Tensor a, Tensor b, "
+           "Tensor h0, Tensor(a!) y, Tensor(b!) h1) -> ()");
+ }
+
+ TORCH_LIBRARY_IMPL(wind_backstepping_single_step, CUDA, m) {
+     m.impl("forward_single_step", forward_single_step);
+ }
rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_torch.py
@@ -0,0 +1,112 @@
+ import os
+ import torch
+ from torch.utils.cpp_extension import load
+
+
+ def get_torch_generalized_delta_rule_single_step(HEAD_SIZE=64):
+     flags = [
+         "-res-usage",
+         f"-D_C_={HEAD_SIZE}",
+         "-D_CHUNK_LEN_=1",
+         "--use_fast_math",
+         "-O3",
+         "-Xptxas -O3",
+         "--extra-device-vectorization",
+     ]
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     load(
+         name="wind_backstepping_single_step",
+         sources=[
+             os.path.join(current_dir, "wkv7_single_step_cuda.cu"),
+             os.path.join(current_dir, "wkv7_single_step_op.cpp"),
+         ],
+         is_python_module=False,
+         verbose=False,
+         extra_cuda_cflags=flags,
+     )
+
+     class WindBacksteppingSingleStep(torch.autograd.Function):
+         @staticmethod
+         def forward(ctx, w, q, k, v, a, b, h0):
+             DTYPE = q.dtype
+             w = w.contiguous().bfloat16()
+             q = q.contiguous().bfloat16()
+             k = k.contiguous().bfloat16()
+             v = v.contiguous().bfloat16()
+             a = a.contiguous().bfloat16()
+             b = b.contiguous().bfloat16()
+             h0 = h0.contiguous().float()
+             y = torch.empty_like(v)
+             h1 = torch.empty_like(h0)
+             torch.ops.wind_backstepping_single_step.forward_single_step(
+                 w, q, k, v, a, b, h0, y, h1
+             )
+             return y.to(DTYPE), h1
+
+         @staticmethod
+         def backward(ctx, *grads):
+             raise NotImplementedError("single-step kernel does not support backward")
+
+     def run_single_step(w, q, k, v, a, b, h0):
+         return WindBacksteppingSingleStep.apply(w, q, k, v, a, b, h0)
+
+     def generalized_delta_rule(
+         r: torch.Tensor,
+         w: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         a: torch.Tensor,
+         b: torch.Tensor,
+         *,
+         initial_state: torch.Tensor = None,
+         output_final_state: bool = True,
+         head_first: bool = False,
+     ):
+         """
+         Single-step RWKV7 forward. Input shapes:
+             head_first=False -> (B, 1, H, K)  (default)
+             head_first=True  -> (B, H, 1, K)
+         The output has the same shape as the input.
+         """
+         if w.device.type != "cuda":
+             from ..native_keras_op import generalized_delta_rule
+
+             return generalized_delta_rule(
+                 r=r,
+                 k=k,
+                 v=v,
+                 a=a,
+                 b=b,
+                 w=w,
+                 initial_state=initial_state,
+                 output_final_state=output_final_state,
+             )
+         # 1. Squeeze out the time dimension so every input is (B, H, K)
+         if head_first:  # (B, H, 1, K) -> (B, H, K)
+             r = r.squeeze(2)
+             w = w.squeeze(2)
+             k = k.squeeze(2)
+             v = v.squeeze(2)
+             a = a.squeeze(2)
+             b = b.squeeze(2)
+         else:  # (B, 1, H, K) -> (B, H, K)
+             r = r.squeeze(1)
+             w = w.squeeze(1)
+             k = k.squeeze(1)
+             v = v.squeeze(1)
+             a = a.squeeze(1)
+             b = b.squeeze(1)
+
+         B, H, K = r.shape
+         if initial_state is None:
+             initial_state = torch.zeros(
+                 B, H, K, K, dtype=torch.float32, device=r.device
+             )
+
+         # 2. Run the single-step kernel
+         y, h1 = run_single_step(w, r, k, v, a, b, initial_state)  # y: (B, H, K)
+         y = y.unsqueeze(2) if head_first else y.unsqueeze(1)  # restore the time dim
+
+         return (y, h1) if output_final_state else y
+
+     return generalized_delta_rule
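Illustrative usage (editorial note, not part of the package contents): the factory above returns a single-token step function, so autoregressive decoding is a loop that calls it once per token and feeds the returned state back in. A minimal sketch, assuming a CUDA device and a direct import from the module above; the random tensors stand in for real per-token projections:

    import torch
    from rwkv_ops.rwkv7_kernel.torch_cuda_kernel_single.wkv7_single_step_torch import (
        get_torch_generalized_delta_rule_single_step,
    )

    step = get_torch_generalized_delta_rule_single_step(HEAD_SIZE=64)

    B, H, K = 1, 8, 64
    state = torch.zeros(B, H, K, K, dtype=torch.float32, device="cuda")

    for _ in range(16):  # decode 16 tokens, one at a time
        r, w, k, v, a, b = (
            torch.randn(B, 1, H, K, device="cuda") for _ in range(6)
        )
        y, state = step(r, w, k, v, a, b, initial_state=state)  # y: (B, 1, H, K)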
rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py
@@ -0,0 +1,13 @@
+ from ..torch_kernel.chunk_A_fwd import *
+ from ..torch_kernel.chunk_A_bwd import *
+
+ # ---------- chunk_h ----------
+ from ..torch_kernel.chunk_h_fwd import *
+ from ..torch_kernel.chunk_h_bwd import *
+
+ # ---------- chunk_o ----------
+ from ..torch_kernel.chunk_o_fwd import *
+ from ..torch_kernel.chunk_o_bwd import *
+ from ..torch_kernel.cumsum import *
+ from ..torch_kernel.wy_fast_fwd import *
+ from ..torch_kernel.wy_fast_bwd import *
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py
@@ -0,0 +1,96 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+ import torch
+ import triton
+ from ..triton_kernel.chunk_A_bwd import *
+ from ..triton_kernel.utils import is_gather_supported
+ from ..get_torch_devices_info import check_shared_mem
+
+
+ def chunk_dplr_bwd_dqk_intra(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     gi: torch.Tensor,
+     ge: torch.Tensor,
+     dAqk: torch.Tensor,
+     dAqb: torch.Tensor,
+     dAak: torch.Tensor,
+     dAab: torch.Tensor,
+     dqg: torch.Tensor,
+     dkg: torch.Tensor,
+     dag: torch.Tensor,
+     dbg: torch.Tensor,
+     dgk_last: torch.Tensor,
+     scale: float = 1.0,
+     chunk_size: int = 16,
+ ):
+     B, T, H, K = q.shape
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+     BK = (
+         min(64, triton.next_power_of_2(K))
+         if check_shared_mem()
+         else min(32, triton.next_power_of_2(K))
+     )
+
+     NT = triton.cdiv(T, BT)
+     NK = triton.cdiv(K, BK)
+     grid = (NK, NT, B * H)
+
+     dq = torch.empty_like(q)
+     dk = torch.empty_like(k)
+     da = torch.empty_like(a)
+     db = torch.empty_like(b)
+     dgk = torch.empty_like(gi, dtype=torch.float)
+     dgk_offset = torch.empty_like(gi, dtype=torch.float)
+
+     chunk_dplr_bwd_kernel_intra[grid](
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         dAqk=dAqk,
+         dAqb=dAqb,
+         dAak=dAak,
+         dAab=dAab,
+         dq=dq,
+         dk=dk,
+         dgk=dgk,
+         dgk_offset=dgk_offset,
+         dqg=dqg,
+         dkg=dkg,
+         dag=dag,
+         dbg=dbg,
+         da=da,
+         db=db,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+         BC=BT,
+         BK=BK,
+         GATHER_SUPPORTED=is_gather_supported,
+     )
+
+     dgk_output = torch.empty_like(dgk)
+
+     def grid(meta):
+         return (NT, triton.cdiv(K, meta["BK"]), B * H)
+
+     chunk_dplr_bwd_dgk_kernel[grid](
+         dgk=dgk,
+         dgk_offset=dgk_offset,
+         dgk_last=dgk_last,
+         dgk_output=dgk_output,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+     )
+     return dq, dk, da, db, dgk_output
rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py
@@ -0,0 +1,64 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+ import torch
+ import triton
+
+ from ..triton_kernel.utils import is_gather_supported
+
+ from ..triton_kernel.chunk_A_fwd import *
+
+
+ def chunk_dplr_fwd_intra(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     gi: torch.Tensor,
+     ge: torch.Tensor,
+     scale: float,
+     chunk_size: int,
+ ):
+     B, T, H, K = k.shape
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+
+     NT = triton.cdiv(T, BT)
+
+     Aqk = q.new_empty(B, T, H, BT, dtype=q.dtype)
+     Aqb = q.new_empty(B, T, H, BT, dtype=q.dtype)
+     # These feed a matrix inverse, so it is better to keep them in float.
+     Aab = q.new_empty(B, T, H, BT, dtype=torch.float)
+     Aak = q.new_empty(B, T, H, BT, dtype=torch.float)
+
+     grid = (NT, B, H)
+     BK = triton.next_power_of_2(K)
+     qg = torch.empty_like(q)
+     kg = torch.empty_like(k, dtype=q.dtype)
+     ag = torch.empty_like(a, dtype=q.dtype)
+     bg = torch.empty_like(b, dtype=q.dtype)
+     chunk_dplr_fwd_A_kernel_intra_sub_intra[grid](
+         q=q,
+         k=k,
+         a=a,
+         b=b,
+         gi=gi,
+         ge=ge,
+         Aqk=Aqk,
+         Aqb=Aqb,
+         Aab=Aab,
+         Aak=Aak,
+         qg=qg,
+         kg=kg,
+         ag=ag,
+         bg=bg,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+         BC=BT,
+         BK=BK,
+         GATHER_SUPPORTED=is_gather_supported,
+     )
+     return Aab, Aqk, Aak, Aqb, qg, kg, ag, bg