rwkv-ops 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rwkv_ops/__init__.py +45 -0
  2. rwkv_ops/mhc_kernel/__init__.py +50 -0
  3. rwkv_ops/mhc_kernel/common_kernel/include/mhc_types.h +66 -0
  4. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_post_op.cuh +197 -0
  5. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_pre_op.cuh +212 -0
  6. rwkv_ops/mhc_kernel/common_kernel/kernels/rmsnorm.cuh +152 -0
  7. rwkv_ops/mhc_kernel/common_kernel/kernels/sinkhorn_knopp.cuh +158 -0
  8. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_aggregate.cuh +141 -0
  9. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_distribute.cuh +111 -0
  10. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_mix.cuh +164 -0
  11. rwkv_ops/mhc_kernel/common_kernel/kernels/type_conversions.cuh +52 -0
  12. rwkv_ops/mhc_kernel/jax_kernel/CMakeLists.txt +47 -0
  13. rwkv_ops/mhc_kernel/jax_kernel/mhu_ffi.cu +652 -0
  14. rwkv_ops/mhc_kernel/jax_kernel/mhu_jax.py +939 -0
  15. rwkv_ops/mhc_kernel/native_keras_op.py +193 -0
  16. rwkv_ops/mhc_kernel/torch_kernel/mhc_cuda.cu +207 -0
  17. rwkv_ops/mhc_kernel/torch_kernel/mhc_op.cpp +296 -0
  18. rwkv_ops/mhc_kernel/torch_kernel/mhc_torch.py +306 -0
  19. rwkv_ops/rwkv6_kernel/__init__.py +120 -0
  20. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp +44 -0
  21. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h +64 -0
  22. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h +56 -0
  23. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h +41 -0
  24. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu +512 -0
  25. rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp +44 -0
  26. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h +64 -0
  27. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h +56 -0
  28. rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h +41 -0
  29. rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip +514 -0
  30. rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +722 -0
  31. rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +90 -0
  32. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu +397 -0
  33. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp +93 -0
  34. rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +305 -0
  35. rwkv_ops/rwkv7_kernel/__init__.py +113 -0
  36. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +220 -0
  37. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  38. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt +42 -0
  39. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu +399 -0
  40. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py +311 -0
  41. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/CMakeLists.txt +42 -0
  42. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_ffi.cu +172 -0
  43. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_jax.py +190 -0
  44. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  45. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  46. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  47. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  48. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  49. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  50. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  51. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  52. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  53. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  54. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  55. rwkv_ops/rwkv7_kernel/mlx_op.py +118 -0
  56. rwkv_ops/rwkv7_kernel/native_keras_op.py +108 -0
  57. rwkv_ops/rwkv7_kernel/tf_eager_kernel.py +155 -0
  58. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu +235 -0
  59. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp +63 -0
  60. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_torch.py +233 -0
  61. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_cuda.cu +101 -0
  62. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_op.cpp +56 -0
  63. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_torch.py +112 -0
  64. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  65. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  66. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  67. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  68. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  69. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  70. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  71. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  72. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  73. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  74. rwkv_ops/rwkv7_kernel/torch_op.py +504 -0
  75. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  76. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  77. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  78. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  79. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  80. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  81. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  82. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  83. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  84. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  85. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  86. rwkv_ops-0.6.1.dist-info/METADATA +495 -0
  87. rwkv_ops-0.6.1.dist-info/RECORD +89 -0
  88. rwkv_ops-0.6.1.dist-info/WHEEL +4 -0
  89. rwkv_ops-0.6.1.dist-info/licenses/LICENSE.txt +201 -0
@@ -0,0 +1,250 @@
+ import functools
+ import os
+ from functools import lru_cache
+ from typing import Literal
+
+ import triton
+ from packaging import version
+ import torch
+ from enum import Enum
+ import contextlib
+
+
+ @lru_cache(maxsize=None)
+ def get_multiprocessor_count(tensor_idx: int = 0) -> int:
+     return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)[
+         "multiprocessor_count"
+     ]
+
+
+ @lru_cache(maxsize=None)
+ def get_available_device() -> str:
+     try:
+         return triton.runtime.driver.active.get_current_target().backend
+     except BaseException:
+         import warnings
+
+         warnings.warn(
+             ("Triton is not supported on current platform, roll back to CPU."),
+             stacklevel=1,
+         )
+         return "cpu"
+
+
+ @lru_cache(maxsize=None)
+ def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
+     device = get_available_device()
+     if device == "cuda":
+         return "nvidia"
+     elif device == "hip":
+         return "amd"
+     elif device == "xpu":
+         return "intel"
+     else:
+         return device
+
+
+ # For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
+ # However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
+ # Therefore, we need to check the triton backend to determine the actual GPU vendor.
+ device = get_available_device() if get_available_device() != "hip" else "cuda"
+
+ device_platform = _check_platform()
+
+ is_intel = device_platform == "intel"
+ is_nvidia = device_platform == "nvidia"
+ is_amd = device_platform == "amd"
+
+ use_cuda_graph = is_nvidia and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1"
+
+
+ @lru_cache(maxsize=None)
+ def check_pytorch_version(version_s: str = "2.4") -> bool:
+     return version.parse(torch.__version__) >= version.parse(version_s)
+
+
+ is_intel_a770 = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)
+ device = get_available_device() if get_available_device() != "hip" else "cuda"
+ device_torch_lib = getattr(torch, device)
+ if check_pytorch_version("2.4"):
+     device = "cuda" if device == "cpu" else device
+     autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device)
+     autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device)
+
+     def custom_device_ctx(index: int):
+         return device_torch_lib.device(index)
+ else:
+     assert device == "cuda", (
+         "Only cuda device is supported for PyTorch version < 2.4.0."
+     )
+     autocast_custom_fwd = device_torch_lib.amp.custom_fwd
+     autocast_custom_bwd = device_torch_lib.amp.custom_bwd
+
+     def custom_device_ctx(index: int):
+         return torch.cuda.device(index)
+
+
+ # Nvidia Ampere or newer; AMD and Intel have not been checked yet.
+ is_tf32_supported = is_nvidia and torch.cuda.get_device_capability(0)[0] >= 8
+
+
+ def get_all_max_shared_memory():
+     return [
+         triton.runtime.driver.active.utils.get_device_properties(i)["max_shared_mem"]
+         for i in range(device_torch_lib.device_count())
+     ]
+
+
+ device_shared_mem_list = get_all_max_shared_memory()
+
+
+ @lru_cache(maxsize=None)
+ def is_triton_shared_mem_enough(
+     max_shared_mem: int = 102400, tensor_idx: int = 0
+ ) -> bool:
+     max_shared_memory = device_shared_mem_list[tensor_idx]
+     return max_shared_memory >= max_shared_mem
+
+
+ device_capacity = is_triton_shared_mem_enough()
+
+
+ def _cpu_device_warning():
+     import warnings
+
+     warnings.warn(
+         ("Triton is not supported on current platform, roll back to CPU."), stacklevel=1
+     )
+
+
+ def get_all_max_shared_mem():
+     try:
+         return [
+             triton.runtime.driver.active.utils.get_device_properties(i)[
+                 "max_shared_mem"
+             ]
+             for i in range(device_torch_lib.device_count())
+         ]
+     except BaseException:
+         _cpu_device_warning()
+         return [-1]
+
+
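+ # Maximum opt-in shared memory per thread block, in bytes, for common GPU architectures
+ # (99 KB on Ada/RTX 4090, 163 KB on Ampere/A100, 227 KB on Hopper/H100).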
+ class Backend(Enum):
+     ADA = 101376  # RTX 4090
+     AMPERE = 166912  # A100
+     HOPPER = 232448  # H100
+     DEFAULT = 102400  # Default
+
+     @classmethod
+     def get_shared_memory(cls, arch: str) -> int:
+         try:
+             return cls[arch.upper()].value
+         except KeyError:
+             return cls.DEFAULT.value
+
+
+ @lru_cache(maxsize=None)
+ def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
+     try:
+         device_shared_mem_list = get_all_max_shared_mem()
+         max_shared_memory = device_shared_mem_list[tensor_idx]
+         return max_shared_memory >= Backend.get_shared_memory(arch)
+     except Exception:
+         return False
+
+
+ def tensor_cache(fn):
+     """
+     A decorator that caches the most recent result of a function with tensor inputs.
+
+     This decorator will store the output of the decorated function for the most recent set of input tensors.
+     If the function is called again with the same input tensors, it will return the cached result.
+
+     Args:
+         fn (Callable[..., torch.Tensor]):
+             The function to be decorated. It should take tensor inputs and return tensor outputs.
+
+     Returns:
+         Callable[..., torch.Tensor]:
+             A wrapped version of the input function with single-entry caching.
+     """
+     last_args = None
+     last_kwargs = None
+     last_result = None
+
+     @functools.wraps(fn)
+     def wrapper(*args, **kwargs):
+         nonlocal last_args, last_kwargs, last_result
+
+         if last_args is not None and last_kwargs is not None:
+             if len(args) == len(last_args) and len(kwargs) == len(last_kwargs):
+                 if all(a is b for a, b in zip(args, last_args)) and all(
+                     k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()
+                 ):
+                     return last_result
+
+         result = fn(*args, **kwargs)
+         last_args, last_kwargs, last_result = args, kwargs, result
+         return result
+
+     return wrapper
+
+
+ @tensor_cache
+ def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+     return cu_seqlens[1:] - cu_seqlens[:-1]
+
+
+ @tensor_cache
+ def prepare_chunk_indices(
+     cu_seqlens: torch.LongTensor, chunk_size: int
+ ) -> torch.LongTensor:
+     indices = torch.cat(
+         [
+             torch.arange(n)
+             for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
+         ]
+     )
+     return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
+
+
+ def input_guard(fn):
+     """
+     A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
+     """
+
+     @functools.wraps(fn)
+     def wrapper(*args, **kwargs):
+         contiguous_args = (
+             i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args
+         )
+         contiguous_kwargs = {
+             k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+             for k, v in kwargs.items()
+         }
+
+         tensor = None
+         for arg in args:
+             if isinstance(arg, torch.Tensor):
+                 tensor = arg
+                 break
+         if tensor is None:
+             for value in kwargs.values():
+                 if isinstance(value, torch.Tensor):
+                     tensor = value
+                     break
+
+         if tensor is not None:
+             ctx = custom_device_ctx(tensor.device.index)
+         else:
+             ctx = contextlib.nullcontext()
+
+         with ctx:
+             return fn(*contiguous_args, **contiguous_kwargs)
+
+     return wrapper
+
+
+ is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)
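
A quick usage sketch for the helpers defined above (the tensor values and the decorated function are illustrative, and a CUDA device is assumed):

    import torch

    # cumulative sequence lengths for a packed batch of two sequences (lengths 5 and 7)
    cu_seqlens = torch.tensor([0, 5, 12], dtype=torch.long, device="cuda")
    lens = prepare_lens(cu_seqlens)                   # tensor([5, 7])
    chunk_idx = prepare_chunk_indices(cu_seqlens, 4)  # rows of (sequence id, chunk id), cached by @tensor_cache

    @input_guard
    def scale(x: torch.Tensor) -> torch.Tensor:
        # x arrives contiguous, with the device context matching x already active
        return x * 2.0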
@@ -0,0 +1,42 @@
+ cmake_minimum_required(VERSION 3.18)
+ project(wkv7 LANGUAGES CXX CUDA)
+
+ find_package(CUDAToolkit REQUIRED)
+
+ # ---------- 1. Find Python ----------
+ find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+ # ---------- 2. Get the XLA header include path ----------
+ execute_process(
+     COMMAND "${Python3_EXECUTABLE}" -c "from jax import ffi; print(ffi.include_dir())"
+     OUTPUT_VARIABLE XLA_INCLUDE_DIR
+     OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(NOT XLA_INCLUDE_DIR)
+     message(FATAL_ERROR "Cannot get XLA include dir from jax.ffi")
+ endif()
+ message(STATUS "XLA include directory: ${XLA_INCLUDE_DIR}")
+
+ # ---------- 3. Build the shared library ----------
+ add_library(wkv7 SHARED wkv7_ffi.cu)
+
+ # 3-1. Header search paths
+ target_include_directories(wkv7 PRIVATE ${XLA_INCLUDE_DIR})
+
+ # 3-2. Link against the CUDA runtime
+ target_link_libraries(wkv7 PRIVATE CUDA::cudart)
+
+ # 3-3. Important: C++17 / CUDA 17 standards
+ target_compile_features(wkv7 PUBLIC cxx_std_17)
+ set_target_properties(wkv7 PROPERTIES
+     CUDA_STANDARD 17
+     CUDA_SEPARABLE_COMPILATION ON
+     POSITION_INDEPENDENT_CODE ON
+     PREFIX ""  # drop the default "lib" prefix
+ )
+
+ # ---------- 4. Install ----------
+ # Install the .so directly into the source directory (next to wkv7_jax.py) so ctypes.CDLL can load it
+ install(TARGETS wkv7
+     LIBRARY DESTINATION "${CMAKE_SOURCE_DIR}"
+     RUNTIME DESTINATION "${CMAKE_SOURCE_DIR}")  # RUNTIME is used on Windows
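
The install rule above drops the built library next to wkv7_jax.py so the Python wrapper can load it with ctypes. A minimal sketch of that loading step, assuming the default target name (wkv7.so) and the usual out-of-source workflow (cmake -S . -B build, cmake --build build, cmake --install build):

    import ctypes
    import os

    # load the shared library from the directory of the current file
    # (the file name "wkv7.so" is assumed from the CMake target above)
    _here = os.path.dirname(os.path.abspath(__file__))
    _lib = ctypes.CDLL(os.path.join(_here, "wkv7.so"))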
@@ -0,0 +1,399 @@
+ #include <cuda_bf16.h>
+ #include <cuda_runtime.h>
+ #include <xla/ffi/api/ffi.h>
+ #include <vector>
+ #include <cstdint>
+ // ref link: https://github.com/BlinkDL/RWKV-CUDA/tree/main/rwkv7_fast_fused
+ namespace ffi = xla::ffi;
+
+ /* -------------------- Type aliases -------------------- */
+ using bf = __nv_bfloat16;
+
+ /* -------------------- Device-side helpers -------------------- */
+ __device__ inline float to_float(const bf &u) {
+     return __bfloat162float(u);
+ }
+ __device__ inline bf to_bf(const float &u) {
+     return __float2bfloat16_rn(u);
+ }
+ typedef bf *__restrict__ F_;
+
+ /* -------------------- Kernel -------------------- */
+ // [Optimization 1] Templated + launch_bounds to improve occupancy
+ template<int C> __launch_bounds__(C, 2)
+ __global__ void forward_kernel(int T, int H,
+                                F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_,
+                                bf *y_, float *s_, float *sa_, float *h0_) {
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float state[C] = {0};
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     int64_t h0_base = ((int64_t)bb * H + hh) * C * C + i * C;
+
+     #pragma unroll
+     for (int j = 0; j < C; ++j) state[j] = h0_[h0_base + j];
+
+     for (int t = 0; t < T; ++t) {
+         // [Optimization 2] Force int64_t indexing to prevent overflow
+         int64_t ind = (int64_t)bb * T * H * C + (int64_t)t * H * C + hh * C + i;
+
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) sa += a[j] * state[j];
+         sa_[ind] = sa;
+
+         float v_val = to_float(v_[ind]);
+         float y = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             float &s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v_val;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+
+         if ((t + 1) % _CHUNK_LEN_ == 0) {
+             int64_t base = ((int64_t)bb * H + hh) * (T / _CHUNK_LEN_) * C * C +
+                            ((int64_t)t / _CHUNK_LEN_) * C * C + i;
+             #pragma unroll
+             for (int j = 0; j < C; ++j) s_[base + j * C] = state[j];
+         }
+     }
+ }
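+ // forward_kernel above computes, per head, the RWKV-7 recurrence
+ //   S_t = S_{t-1} * diag(w_t) + (S_{t-1} a_t) b_t^T + v_t k_t^T,   y_t = S_t q_t,
+ // with w_t = exp(-exp(w_raw_t)); each thread owns one row of the C x C state S, and every
+ // _CHUNK_LEN_ steps the state is checkpointed (transposed) into s_ for reuse in the backward pass.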
+
+ // [Optimization 3] Backward kernel: templated + launch_bounds + float4 vectorized loads
+ template<int C> __launch_bounds__(C, 2)
+ __global__ void backward_kernel(int T, int H,
+                                 F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_, F_ dy_,
+                                 float *s_, float *sa_, float *dht_, float *dh0_,
+                                 bf *dw_, bf *dq_, bf *dk_, bf *dv_, bf *da_, bf *db_) {
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float stateT[C] = {0}, dstate[C] = {0}, dstateT[C] = {0};
+
+     int64_t dht_base = ((int64_t)bb * H + hh) * C * C + i * C;
+
+     #pragma unroll
+     for (int j = 0; j < C; ++j) {
+         dstate[j] = dht_[dht_base + j];
+         dstateT[j] = dht_[dht_base + j];
+     }
+     __shared__ float w[C], q[C], k[C], v[C], a[C], b[C], dy[C], sa[C], dSb_shared[C];
+     float qi, wi, ki, ai, bi, dyi;
+
+     for (int t = T - 1; t >= 0; --t) {
+         int64_t ind = (int64_t)bb * T * H * C + (int64_t)t * H * C + hh * C + i;
+
+         __syncthreads();
+         q[i] = qi = to_float(q_[ind]);
+         float wi_fac = -__expf(to_float(w_[ind]));
+         w[i] = wi = __expf(wi_fac);
+         k[i] = ki = to_float(k_[ind]);
+         a[i] = ai = to_float(a_[ind]);
+         b[i] = bi = to_float(b_[ind]);
+         v[i] = to_float(v_[ind]);
+         dy[i] = dyi = to_float(dy_[ind]);
+         sa[i] = sa_[ind];
+         __syncthreads();
+
+         if ((t + 1) % _CHUNK_LEN_ == 0) {
+             int64_t base = ((int64_t)bb * H + hh) * (T / _CHUNK_LEN_) * C * C +
+                            ((int64_t)t / _CHUNK_LEN_) * C * C + i * C;
+
+             // [Optimization 4] float4 vectorized loads, 4x better bandwidth utilization
+             const float4* s4 = (const float4*)(s_ + base);
+             #pragma unroll
+             for (int j4 = 0; j4 < C / 4; ++j4) {
+                 float4 q_vec = s4[j4];
+                 const int j = j4 * 4;
+                 stateT[j + 0] = q_vec.x;
+                 stateT[j + 1] = q_vec.y;
+                 stateT[j + 2] = q_vec.z;
+                 stateT[j + 3] = q_vec.w;
+             }
+         }
+
+         float dq_val = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) dq_val += stateT[j] * dy[j];
+         dq_[ind] = to_bf(dq_val);
+
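+         // Roll the state back one step: undo the forward update (subtract the k*v and sa*b
+         // rank-1 contributions for this column, then divide by w, with a small epsilon),
+         // and fold the incoming gradient dy_t into the running state gradients.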
+         float iwi = 1.f / (wi + 1e-6f);
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             stateT[j] = (stateT[j] - ki * v[j] - bi * sa[j]) * iwi;
+             dstate[j] += dyi * q[j];
+             dstateT[j] += qi * dy[j];
+         }
+
+         float dw = 0.f, dk = 0.f, dv = 0.f, db = 0.f, dSb = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             dw += dstateT[j] * stateT[j];
+             dk += dstateT[j] * v[j];
+             dv += dstate[j] * k[j];
+             dSb += dstate[j] * b[j];
+             db += dstateT[j] * sa[j];
+         }
+         dw_[ind] = to_bf(dw * wi * wi_fac);
+         dk_[ind] = to_bf(dk);
+         dv_[ind] = to_bf(dv);
+         db_[ind] = to_bf(db);
+
+         __syncthreads();
+         dSb_shared[i] = dSb;
+         __syncthreads();
+
+         float da = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) da += stateT[j] * dSb_shared[j];
+         da_[ind] = to_bf(da);
+
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             dstate[j] = dstate[j] * w[j] + dSb * a[j];
+             dstateT[j] = dstateT[j] * wi + ai * dSb_shared[j];
+             if (t == 0) dh0_[dht_base + j] = dstate[j];
+         }
+     }
+ }
+
+ /* -------------------- Inference-only kernel -------------------- */
+ template<int C> __launch_bounds__(C, 2)
+ __global__ void forward_inference_kernel(int T, int H,
+                                          F_ w_, F_ q_, F_ k_, F_ v_, F_ a_, F_ b_,
+                                          bf *y_, float *s_, float *h0_) {
+     int bb = blockIdx.y, hh = blockIdx.x, i = threadIdx.x;
+     float state[C] = {0};
+     __shared__ float q[C], k[C], w[C], a[C], b[C];
+
+     int64_t h0_base = ((int64_t)bb * H + hh) * C * C + i * C;
+
+     #pragma unroll
+     for (int j = 0; j < C; ++j) state[j] = h0_[h0_base + j];
+
+     for (int t = 0; t < T; ++t) {
+         int64_t ind = (int64_t)bb * T * H * C + (int64_t)t * H * C + hh * C + i;
+
+         __syncthreads();
+         q[i] = to_float(q_[ind]);
+         w[i] = __expf(-__expf(to_float(w_[ind])));
+         k[i] = to_float(k_[ind]);
+         a[i] = to_float(a_[ind]);
+         b[i] = to_float(b_[ind]);
+         __syncthreads();
+
+         float sa = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) sa += a[j] * state[j];
+
+         float v_val = to_float(v_[ind]);
+         float y = 0.f;
+         #pragma unroll
+         for (int j = 0; j < C; ++j) {
+             float &s = state[j];
+             s = s * w[j] + sa * b[j] + k[j] * v_val;
+             y += s * q[j];
+         }
+         y_[ind] = to_bf(y);
+     }
+
+     int64_t base = ((int64_t)bb * H + hh) * C * C + i * C;
+     #pragma unroll
+     for (int j = 0; j < C; ++j) s_[base + j] = state[j];
+ }
+
+ /* -------------------- Host functions (parameter names unified) -------------------- */
+ static ffi::Error WKV7FwdHost(
+     cudaStream_t stream,
+     ffi::Buffer<ffi::BF16> w,
+     ffi::Buffer<ffi::BF16> q,
+     ffi::Buffer<ffi::BF16> k,
+     ffi::Buffer<ffi::BF16> v,
+     ffi::Buffer<ffi::BF16> a,   // formerly 'z'; maps directly to the kernel's a_
+     ffi::Buffer<ffi::BF16> b,   // formerly 'a'; maps directly to the kernel's b_
+     ffi::Buffer<ffi::F32> h0,
+     ffi::ResultBuffer<ffi::BF16> y,
+     ffi::ResultBuffer<ffi::F32> s,
+     ffi::ResultBuffer<ffi::F32> sa)
+ {
+     constexpr int C = _C_;
+     auto dims = w.dimensions();
+     int B = dims[0], T = dims[1], H = dims[2];
+     dim3 block(C);
+     dim3 grid(H, B);
+
+     // [Key] Template-instantiated launch; arguments map directly onto the kernel parameters
+     forward_kernel<_C_><<<grid, block, 0, stream>>>(
+         T, H,
+         reinterpret_cast<bf *>(w.typed_data()),
+         reinterpret_cast<bf *>(q.typed_data()),
+         reinterpret_cast<bf *>(k.typed_data()),
+         reinterpret_cast<bf *>(v.typed_data()),
+         reinterpret_cast<bf *>(a.typed_data()),   // maps to a_
+         reinterpret_cast<bf *>(b.typed_data()),   // maps to b_
+         reinterpret_cast<bf *>(y->typed_data()),
+         s->typed_data(),
+         sa->typed_data(),
+         h0.typed_data());
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+         return ffi::Error::Internal(
+             std::string("CUDA forward_kernel error: ") + cudaGetErrorString(err));
+     return ffi::Error::Success();
+ }
+
+ static ffi::Error WKV7BwdHost(
+     cudaStream_t stream,
+     ffi::Buffer<ffi::BF16> w,
+     ffi::Buffer<ffi::BF16> q,
+     ffi::Buffer<ffi::BF16> k,
+     ffi::Buffer<ffi::BF16> v,
+     ffi::Buffer<ffi::BF16> a,   // formerly 'z'; maps directly to the kernel's a_
+     ffi::Buffer<ffi::BF16> b,   // formerly 'a'; maps directly to the kernel's b_
+     ffi::Buffer<ffi::BF16> dy,
+     ffi::Buffer<ffi::F32> s,
+     ffi::Buffer<ffi::F32> sa,
+     ffi::Buffer<ffi::F32> dht,
+     ffi::ResultBuffer<ffi::F32> dh0,
+     ffi::ResultBuffer<ffi::BF16> dw,
+     ffi::ResultBuffer<ffi::BF16> dq,
+     ffi::ResultBuffer<ffi::BF16> dk,
+     ffi::ResultBuffer<ffi::BF16> dv,
+     ffi::ResultBuffer<ffi::BF16> da,
+     ffi::ResultBuffer<ffi::BF16> db)
+ {
+     auto dims = w.dimensions();
+     int B = dims[0], T = dims[1], H = dims[2];
+     constexpr int C = _C_;
+     dim3 block(C);
+     dim3 grid(H, B);
+
+     // [Key] Template-instantiated launch; arguments map directly onto the kernel parameters
+     backward_kernel<_C_><<<grid, block, 0, stream>>>(
+         T, H,
+         reinterpret_cast<bf *>(w.typed_data()),
+         reinterpret_cast<bf *>(q.typed_data()),
+         reinterpret_cast<bf *>(k.typed_data()),
+         reinterpret_cast<bf *>(v.typed_data()),
+         reinterpret_cast<bf *>(a.typed_data()),   // maps to a_
+         reinterpret_cast<bf *>(b.typed_data()),   // maps to b_
+         reinterpret_cast<bf *>(dy.typed_data()),
+         s.typed_data(),
+         sa.typed_data(),
+         dht.typed_data(),
+         dh0->typed_data(),
+         reinterpret_cast<bf *>(dw->typed_data()),
+         reinterpret_cast<bf *>(dq->typed_data()),
+         reinterpret_cast<bf *>(dk->typed_data()),
+         reinterpret_cast<bf *>(dv->typed_data()),
+         reinterpret_cast<bf *>(da->typed_data()),
+         reinterpret_cast<bf *>(db->typed_data()));
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+         return ffi::Error::Internal(
+             std::string("CUDA backward_kernel error: ") + cudaGetErrorString(err));
+     return ffi::Error::Success();
+ }
+
+ static ffi::Error WKV7InferenceHost(
+     cudaStream_t stream,
+     ffi::Buffer<ffi::BF16> w,
+     ffi::Buffer<ffi::BF16> q,
+     ffi::Buffer<ffi::BF16> k,
+     ffi::Buffer<ffi::BF16> v,
+     ffi::Buffer<ffi::BF16> a,   // maps directly to the kernel's a_
+     ffi::Buffer<ffi::BF16> b,   // maps directly to the kernel's b_
+     ffi::Buffer<ffi::F32> h0,
+     ffi::ResultBuffer<ffi::BF16> y,
+     ffi::ResultBuffer<ffi::F32> s)
+ {
+     constexpr int C = _C_;
+     auto dims = w.dimensions();
+     int B = dims[0], T = dims[1], H = dims[2];
+     dim3 block(C);
+     dim3 grid(H, B);
+
+     // [Key] Template-instantiated launch; arguments map directly onto the kernel parameters
+     forward_inference_kernel<_C_><<<grid, block, 0, stream>>>(
+         T, H,
+         reinterpret_cast<bf *>(w.typed_data()),
+         reinterpret_cast<bf *>(q.typed_data()),
+         reinterpret_cast<bf *>(k.typed_data()),
+         reinterpret_cast<bf *>(v.typed_data()),
+         reinterpret_cast<bf *>(a.typed_data()),   // maps to a_
+         reinterpret_cast<bf *>(b.typed_data()),   // maps to b_
+         reinterpret_cast<bf *>(y->typed_data()),
+         s->typed_data(),
+         h0.typed_data());
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+         return ffi::Error::Internal(
+             std::string("CUDA forward_inference_kernel error: ") + cudaGetErrorString(err));
+     return ffi::Error::Success();
+ }
+
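+ // Exported handler symbols: Wkv7Fwd (training forward: returns y plus the chunked state
+ // checkpoints s and the per-step a·state products sa), Wkv7Bwd (backward pass), and
+ // Wkv7Inference (forward only: returns y and the final state s).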
+ /* -------------------- FFI registration (parameter names aligned) -------------------- */
+ XLA_FFI_DEFINE_HANDLER_SYMBOL(
+     Wkv7Fwd, WKV7FwdHost,
+     ffi::Ffi::Bind()
+         .Ctx<ffi::PlatformStream<cudaStream_t>>()
+         .Arg<ffi::Buffer<ffi::BF16>>()   // w
+         .Arg<ffi::Buffer<ffi::BF16>>()   // q
+         .Arg<ffi::Buffer<ffi::BF16>>()   // k
+         .Arg<ffi::Buffer<ffi::BF16>>()   // v
+         .Arg<ffi::Buffer<ffi::BF16>>()   // a (formerly z)
+         .Arg<ffi::Buffer<ffi::BF16>>()   // b (formerly a)
+         .Arg<ffi::Buffer<ffi::F32>>()    // h0
+         .Ret<ffi::Buffer<ffi::BF16>>()   // y
+         .Ret<ffi::Buffer<ffi::F32>>()    // s
+         .Ret<ffi::Buffer<ffi::F32>>()    // sa
+     , {ffi::Traits::kCmdBufferCompatible});
+
+ XLA_FFI_DEFINE_HANDLER_SYMBOL(
+     Wkv7Bwd, WKV7BwdHost,
+     ffi::Ffi::Bind()
+         .Ctx<ffi::PlatformStream<cudaStream_t>>()
+         .Arg<ffi::Buffer<ffi::BF16>>()   // w
+         .Arg<ffi::Buffer<ffi::BF16>>()   // q
+         .Arg<ffi::Buffer<ffi::BF16>>()   // k
+         .Arg<ffi::Buffer<ffi::BF16>>()   // v
+         .Arg<ffi::Buffer<ffi::BF16>>()   // a (formerly z)
+         .Arg<ffi::Buffer<ffi::BF16>>()   // b (formerly a)
+         .Arg<ffi::Buffer<ffi::BF16>>()   // dy
+         .Arg<ffi::Buffer<ffi::F32>>()    // s
+         .Arg<ffi::Buffer<ffi::F32>>()    // sa
+         .Arg<ffi::Buffer<ffi::F32>>()    // dht
+         .Ret<ffi::Buffer<ffi::F32>>()    // dh0
+         .Ret<ffi::Buffer<ffi::BF16>>()   // dw
+         .Ret<ffi::Buffer<ffi::BF16>>()   // dq
+         .Ret<ffi::Buffer<ffi::BF16>>()   // dk
+         .Ret<ffi::Buffer<ffi::BF16>>()   // dv
+         .Ret<ffi::Buffer<ffi::BF16>>()   // da
+         .Ret<ffi::Buffer<ffi::BF16>>()   // db
+     , {ffi::Traits::kCmdBufferCompatible});
+
+ XLA_FFI_DEFINE_HANDLER_SYMBOL(
+     Wkv7Inference, WKV7InferenceHost,
+     ffi::Ffi::Bind()
+         .Ctx<ffi::PlatformStream<cudaStream_t>>()
+         .Arg<ffi::Buffer<ffi::BF16>>()   // w
+         .Arg<ffi::Buffer<ffi::BF16>>()   // q
+         .Arg<ffi::Buffer<ffi::BF16>>()   // k
+         .Arg<ffi::Buffer<ffi::BF16>>()   // v
+         .Arg<ffi::Buffer<ffi::BF16>>()   // a
+         .Arg<ffi::Buffer<ffi::BF16>>()   // b
+         .Arg<ffi::Buffer<ffi::F32>>()    // h0
+         .Ret<ffi::Buffer<ffi::BF16>>()   // y
+         .Ret<ffi::Buffer<ffi::F32>>()    // s (final state)
+     , {ffi::Traits::kCmdBufferCompatible});
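
For orientation, a minimal sketch of how a handler such as Wkv7Fwd can be exposed to JAX once the library has been loaded (as sketched after the CMakeLists above); the target name, the (B, T, H, C) layout, and the output shapes below are illustrative assumptions, not the package's actual wkv7_jax.py wrapper:

    import jax
    import jax.numpy as jnp

    # register the XLA FFI handler symbol under an illustrative target name
    jax.ffi.register_ffi_target("wkv7_fwd", jax.ffi.pycapsule(_lib.Wkv7Fwd), platform="CUDA")

    def wkv7_fwd(w, q, k, v, a, b, h0, chunk_len=16):
        B, T, H, C = w.shape  # assumed layout, matching the kernel's (bb, t, hh, i) indexing
        out_types = (
            jax.ShapeDtypeStruct((B, T, H, C), jnp.bfloat16),                 # y
            jax.ShapeDtypeStruct((B, H, T // chunk_len, C, C), jnp.float32),  # s (chunk checkpoints)
            jax.ShapeDtypeStruct((B, T, H, C), jnp.float32),                  # sa
        )
        return jax.ffi.ffi_call("wkv7_fwd", out_types)(w, q, k, v, a, b, h0)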