rwkv-ops 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of rwkv-ops might be problematic.

Files changed (31)
  1. rwkv_ops/__init__.py +5 -6
  2. rwkv_ops/rwkv6_kernel/__init__.py +0 -6
  3. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp +44 -0
  4. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h +64 -0
  5. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h +56 -0
  6. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h +41 -0
  7. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu +512 -0
  8. rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp +44 -0
  9. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h +64 -0
  10. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h +56 -0
  11. rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h +41 -0
  12. rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip +514 -0
  13. rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +21 -23
  14. rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +14 -10
  15. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu +397 -0
  16. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp +93 -0
  17. rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +4 -4
  18. rwkv_ops/rwkv7_kernel/__init__.py +77 -29
  19. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt +42 -0
  20. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu +279 -0
  21. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py +237 -0
  22. rwkv_ops/rwkv7_kernel/jax_op.py +6 -5
  23. rwkv_ops/rwkv7_kernel/native_keras_op.py +5 -6
  24. rwkv_ops/rwkv7_kernel/tf_eager_kernel.py +123 -0
  25. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu +165 -0
  26. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp +35 -0
  27. {rwkv_ops-0.2.2.dist-info → rwkv_ops-0.3.0.dist-info}/METADATA +28 -27
  28. {rwkv_ops-0.2.2.dist-info → rwkv_ops-0.3.0.dist-info}/RECORD +30 -13
  29. {rwkv_ops-0.2.2.dist-info → rwkv_ops-0.3.0.dist-info}/WHEEL +1 -2
  30. rwkv_ops-0.2.2.dist-info/top_level.txt +0 -1
  31. {rwkv_ops-0.2.2.dist-info → rwkv_ops-0.3.0.dist-info/licenses}/LICENSE.txt +0 -0
rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu
@@ -0,0 +1,512 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #include "kernel_helpers.h"
+ #include "kernels.h"
+ #include "stdio.h"
+ #include <cuda_bf16.h>
+ #include <cuda_fp16.h>
+ #include <iostream>
+ #include <assert.h>
+ namespace {
+
+ #define DISPATCH_Vector_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
+   switch (TYPEIN) { \
+   case gpu_ops::ElementType::F32: { \
+     using input_type = float; \
+     switch (TYPEOUT) { \
+     case gpu_ops::ElementType::F32: { \
+       using output_type = float; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::F16: { \
+       using output_type = __half; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::BF16: { \
+       using output_type = __nv_bfloat16; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     default: \
+       break; \
+     } \
+     break; \
+   } \
+   case gpu_ops::ElementType::F16: { \
+     using input_type = __half; \
+     switch (TYPEOUT) { \
+     case gpu_ops::ElementType::F32: { \
+       using output_type = float; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::F16: { \
+       using output_type = __half; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::BF16: { \
+       using output_type = __nv_bfloat16; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     default: \
+       break; \
+     } \
+     break; \
+   } \
+   case gpu_ops::ElementType::BF16: { \
+     using input_type = __nv_bfloat16; \
+     switch (TYPEOUT) { \
+     case gpu_ops::ElementType::F32: { \
+       using output_type = float; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::F16: { \
+       using output_type = __half; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     case gpu_ops::ElementType::BF16: { \
+       using output_type = __nv_bfloat16; \
+       __VA_ARGS__; \
+       break; \
+     } \
+     default: \
+       break; \
+     } \
+     break; \
+   } \
+   default: \
+     break; \
+   }
+
+
+ static_assert(_N_ % 4 == 0, "the head size (_N_) must be a multiple of 4.");
+
+
+
+
+
+ template <typename F_in, typename F_out>
+ __device__ void kernel_forward_core(const int B, const int T, const int C, const int H, const int b, const int h, const int i, const float* state,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v, const F_in *__restrict__ _w, const F_in *__restrict__ _u,
+         F_out *__restrict__ const _y)
+ {
+
+     _u += h*_N_;
+
+     __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];
+     //float state[_N_] = {0};
+
+     __syncthreads();
+     u[i] = float(_u[i]);
+     __syncthreads();
+
+     for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)
+     {
+         __syncthreads();
+         w[i] = __expf(-__expf(float(_w[t])));
+         r[i] = float(_r[t]);
+         k[i] = float(_k[t]);
+         __syncthreads();
+
+         const float v = float(_v[t]);
+         float y = 0;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j += 4)
+         {
+             const float4& r_ = (float4&)(r[j]);
+             const float4& k_ = (float4&)(k[j]);
+             const float4& w_ = (float4&)(w[j]);
+             const float4& u_ = (float4&)(u[j]);
+             float4& s = (float4&)(state[j]);
+             float4 x;
+
+             x.x = k_.x * v;
+             x.y = k_.y * v;
+             x.z = k_.z * v;
+             x.w = k_.w * v;
+
+             y += r_.x * (u_.x * x.x + s.x);
+             y += r_.y * (u_.y * x.y + s.y);
+             y += r_.z * (u_.z * x.z + s.z);
+             y += r_.w * (u_.w * x.w + s.w);
+
+             s.x = s.x * w_.x + x.x;
+             s.y = s.y * w_.y + x.y;
+             s.z = s.z * w_.z + x.z;
+             s.w = s.w * w_.w + x.w;
+         }
+         _y[t] = F_out(y);
+     }
+ }
+
+
+
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_forward_state(const int B, const int T, const int C, const int H, const bool is_custom_state, const int32_t* map,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v, const F_in *__restrict__ _w, const F_in *__restrict__ _u,
+         const F_out *__restrict__ _s, F_out *__restrict__ const _y, F_out *__restrict__ const _ys)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+     float state[_N_] = {0};
+     if (is_custom_state) {
+         assert(map[b] >= 0 && map[b] < B);
+
+         const int64_t input_state_offset = map[b] * H * _N_ *_N_ + h * _N_ * _N_ + i;
+
+         for (int j = 0; j < _N_; j++) {
+             state[j] = float(_s[j*_N_ + input_state_offset]);
+         }
+     }
+
+     const int64_t current_state_offset = b * H * _N_ *_N_ + h * _N_ * _N_ + i;
+
+     kernel_forward_core(B, T, C, H, b, h, i, state, _r, _k, _v, _w, _u, _y);
+     for (int j = 0; j < _N_; j++) {
+         _ys[j*_N_ + current_state_offset] = F_out(state[j]);
+     }
+ }
+
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_forward(const int B, const int T, const int C, const int H,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v, const F_in *__restrict__ _w, const F_in *__restrict__ _u,
+         F_out *__restrict__ const _y)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+     float state[_N_] = {0};
+     kernel_forward_core(B, T, C, H, b, h, i, state, _r, _k, _v, _w, _u, _y);
+ }
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_backward_101(const int B, const int T, const int C, const int H,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v, const F_in *__restrict__ _w,
+         const F_in *__restrict__ _u, const F_out *__restrict__ const _gy,
+         F_out *__restrict__ const _gr, F_out *__restrict__ const _gu)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+
+     __shared__ float v[_N_], gy[_N_];
+
+     const float u = float(_u[h*_N_ + i]);
+
+     float state[_N_] = {0};
+
+     const int t_0 = b*T*C + h*_N_ + i;
+     const int t_T = t_0 + T*C;
+
+     float gu = 0;
+     for (int t = t_0; t < t_T; t += C)
+     {
+         __syncthreads();
+         v[i] = float(_v[t]);
+         gy[i] = float(_gy[t]);
+         __syncthreads();
+
+         const float k = float(_k[t]);
+         const float w = __expf(-__expf(float(_w[t])));
+         float gr = 0, gu_ = 0;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j++)
+         {
+             float& s = state[j];
+             float x = k * v[j];
+
+             gr += (u * x + s) * gy[j];
+             gu_ += x * gy[j];
+             s = s * w + x;
+         }
+         _gr[t] = F_out(gr);
+         gu += float(_r[t]) * gu_;
+     }
+     _gu[b*C + h*_N_ + i] = F_out(gu);
+ }
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_backward_102(const int B, const int T, const int C, const int H,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v,
+         const F_in *__restrict__ _w, const F_in *__restrict__ _u, const F_out *__restrict__ const _gy,
+         F_out *__restrict__ const _gk)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+
+     __shared__ float v[_N_], gy[_N_];
+
+     const float u = float(_u[h*_N_ + i]);
+
+     float scccc[_N_] = {0};
+
+     const int t_0 = b*T*C + h*_N_ + i;
+     const int t_T_1 = t_0 + (T-1)*C;
+
+     for (int t = t_T_1; t >= t_0; t -= C)
+     {
+         __syncthreads();
+         v[i] = float(_v[t]);
+         gy[i] = float(_gy[t]);
+         __syncthreads();
+
+         const float rr = float(_r[t]);
+         const float w = __expf(-__expf(float(_w[t])));
+         float gk = 0;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j++)
+         {
+             float& s = scccc[j];
+             float x = rr * gy[j];
+
+             gk += (u * x + s) * v[j];
+             s = x + s * w;
+         }
+         _gk[t] = F_out(gk);
+     }
+ }
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_backward_103(const int B, const int T, const int C, const int H,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v,
+         const F_in *__restrict__ _w, const F_in *__restrict__ _u, const F_out *__restrict__ const _gy,
+         F_out *__restrict__ const _gv)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+     _u += h*_N_;
+
+     __shared__ float u_[_N_], r[_N_], k[_N_], w_[_N_];
+     __syncthreads();
+     u_[i] = float(_u[i]);
+     __syncthreads();
+
+     float sdddd[_N_] = {0};
+
+     const int t_0 = b*T*C + h*_N_ + i;
+     const int t_T_1 = t_0 + (T-1)*C;
+
+     for (int t = t_T_1; t >= t_0; t -= C)
+     {
+         __syncthreads();
+         r[i] = float(_r[t]);
+         k[i] = float(_k[t]);
+         w_[i] = __expf(-__expf(float(_w[t])));
+         __syncthreads();
+
+         const float gyy = float(_gy[t]);
+         float gv = 0;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j++)
+         {
+             float& s = sdddd[j];
+             float x = gyy * r[j];
+
+             gv += (u_[j] * x + s) * k[j];
+             s = x + s * w_[j];
+         }
+         _gv[t] = F_out(gv);
+     }
+ }
+
+ template <typename F_in, typename F_out>
+ __global__ void kernel_backward_201(const int B, const int T, const int C, const int H,
+         const F_in *__restrict__ const _r, const F_in *__restrict__ const _k, const F_in *__restrict__ const _v, const F_in *__restrict__ _w,
+         const F_in *__restrict__ _u, const F_out *__restrict__ const _gy,
+         F_out *__restrict__ const _gw)
+ {
+     const int b = blockIdx.x / H;
+     const int h = blockIdx.x % H;
+     const int i = threadIdx.x;
+
+     __shared__ float v[_N_], gy[_N_];
+     float saaaa[_N_] = {0}, sbbbb[_T_-2] = {0}, scccc[_N_] = {0};
+
+     const int t_0 = b*T*C + h*_N_ + i;
+     const int t_1 = t_0 + C;
+     const int t_2 = t_0 + 2*C;
+     const int t_T_1 = t_0 + (T-1)*C;
+
+     for (int t = t_T_1; t > t_1; t -= C)
+     {
+         __syncthreads();
+         gy[i] = float(_gy[t]);
+         v[i] = float(_v[t-2*C]);
+         __syncthreads();
+
+         const float r = float(_r[t]);
+         const float w = __expf(-__expf(float(_w[t-C])));
+         float sum = 0.0f;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j++)
+         {
+             float& s = saaaa[j];
+             float x = r * gy[j];
+             s = (s + x) * w;
+             sum += s * v[j];
+         }
+         sbbbb[(t-t_2)/C] = sum * float(_k[t-2*C]);
+     }
+
+     float sss = sbbbb[0];
+     _gw[t_0] = 0;
+     _gw[t_1] = F_out(sss * -__expf(float(_w[t_1])));
+
+     for (int t = t_2; t < t_T_1; t += C)
+     {
+         __syncthreads();
+         gy[i] = float(_gy[t]);
+         v[i] = float(_v[t-2*C]);
+         __syncthreads();
+
+         const float w = __expf(-__expf(float(_w[t-C])));
+         const float k = float(_k[t-2*C]);
+         float sum = 0.0f;
+
+         #pragma unroll
+         for (int j = 0; j < _N_; j++)
+         {
+             float& s = scccc[j];
+             float x = k * v[j];
+             s = (s + x) * w;
+             sum += s * gy[j];
+         }
+         sss += sbbbb[(t-t_1)/C] - (sum * float(_r[t]));
+         _gw[t] = F_out(sss * -__expf(float(_w[t])));
+     }
+     _gw[t_T_1] = 0;
+ }
+
+
+
+
+
+ template <typename T_in, typename T_out>
+ void HostApplyRWKVWithState(cudaStream_t stream, int B, int T, int C, int H, bool S, const int32_t* state_map,
+         const T_in *input_r, const T_in *input_k, const T_in *input_v,
+         const T_in *input_w, const T_in *input_u, T_out *input_s, T_out *output_y, T_out *output_s) {
+     assert(H*_N_ == C);
+     //assert(_N_%4 == 0);
+     kernel_forward_state<<<dim3(B * H), dim3(_N_), _N_ * 4 * sizeof(float), stream>>>(B, T, C, H, S, state_map, input_r, input_k, input_v, input_w, input_u, input_s, output_y, output_s);
+
+ }
+
+
+
+ template <typename T_in, typename T_out>
+ void HostApplyRWKV(cudaStream_t stream, int B, int T, int C, int H,
+         const T_in *input_r, const T_in *input_k, const T_in *input_v,
+         const T_in *input_w, const T_in *input_u, T_out *output_y) {
+     assert(H*_N_ == C);
+     //assert(_N_%4 == 0);
+     kernel_forward<<<dim3(B * H), dim3(_N_), _N_ * 4 * sizeof(float), stream>>>(B, T, C, H, input_r, input_k, input_v, input_w, input_u, output_y);
+
+ }
+ // TODO: set the correct shared-memory size for the kernel launches
+ template <typename T_in, typename T_out>
+ void HostApplyGradient(cudaStream_t stream, int B, int T, int C, int H,
+         T_in *r, T_in *k, T_in *v, T_in *w, T_in *u, T_out *gy, T_out *gr, T_out *gk, T_out *gv, T_out *gw, T_out *gu)
+ {
+     assert(H*_N_ == C);
+     kernel_backward_101<<<dim3(B * H), dim3(_N_), _N_ * 2 * sizeof(float), stream>>>(B, T, C, H, r, k, v, w, u, gy, gr, gu);
+     kernel_backward_102<<<dim3(B * H), dim3(_N_), _N_ * 2 * sizeof(float), stream>>>(B, T, C, H, r, k, v, w, u, gy, gk);
+     kernel_backward_103<<<dim3(B * H), dim3(_N_), _N_ * 4 * sizeof(float), stream>>>(B, T, C, H, r, k, v, w, u, gy, gv);
+     kernel_backward_201<<<dim3(B * H), dim3(_N_), _N_ * 2 * sizeof(float), stream>>>(B, T, C, H, r, k, v, w, u, gy, gw);
+ }
+
+ }
+
+
+ namespace gpu_ops {
+
+ void rwkv_forward_fn(cudaStream_t stream, void **buffers,
+                      const char *opaque,
+                      std::size_t opaque_len) {
+     const WKVDescriptor &d = *UnpackDescriptor<WKVDescriptor>(opaque, opaque_len);
+
+     DISPATCH_Vector_TYPES(
+         d.x_type, d.y_type, "rwkv_forward_kernel",
+         HostApplyRWKV<input_type, output_type>(
+             stream, d.B, d.T, d.C, d.H,
+             static_cast<input_type *>(buffers[0]), static_cast<input_type *>(buffers[1]), static_cast<input_type *>(buffers[2]),
+             static_cast<input_type *>(buffers[3]), static_cast<input_type *>(buffers[4]), static_cast<output_type *>(buffers[5])
+         );
+     )
+ }
+
+
+ void rwkv_forward_with_state_fn(cudaStream_t stream, void **buffers,
+                                 const char *opaque,
+                                 std::size_t opaque_len) {
+     const WKVDescriptor &d = *UnpackDescriptor<WKVDescriptor>(opaque, opaque_len);
+
+     DISPATCH_Vector_TYPES(
+         d.x_type, d.y_type, "rwkv_forward_with_state_kernel",
+         if (d.S) {
+             HostApplyRWKVWithState<input_type, output_type>(
+                 stream, d.B, d.T, d.C, d.H, true, static_cast<int32_t *>(buffers[0])/*map*/,
+                 static_cast<input_type *>(buffers[1])/*r*/, static_cast<input_type *>(buffers[2])/*k*/, static_cast<input_type *>(buffers[3])/*v*/,
+                 static_cast<input_type *>(buffers[4])/*w*/, static_cast<input_type *>(buffers[5])/*u*/, static_cast<output_type *>(buffers[6])/*s*/,
+                 static_cast<output_type *>(buffers[7])/*y*/, static_cast<output_type *>(buffers[8])/*ys*/
+             );
+         } else {
+             HostApplyRWKVWithState<input_type, output_type>(
+                 stream, d.B, d.T, d.C, d.H, false, nullptr,
+                 static_cast<input_type *>(buffers[0])/*r*/, static_cast<input_type *>(buffers[1])/*k*/, static_cast<input_type *>(buffers[2])/*v*/,
+                 static_cast<input_type *>(buffers[3])/*w*/, static_cast<input_type *>(buffers[4])/*u*/, nullptr/*s*/,
+                 static_cast<output_type *>(buffers[5])/*y*/, static_cast<output_type *>(buffers[6])/*ys*/
+             );
+         }
+     )
+ }
+
+
+ void rwkv_backward_fn(cudaStream_t stream, void **buffers,
+                       const char *opaque,
+                       std::size_t opaque_len) {
+     const WKVDescriptor &d = *UnpackDescriptor<WKVDescriptor>(opaque, opaque_len);
+
+     DISPATCH_Vector_TYPES(
+         d.x_type, d.y_type, "rwkv_backward_kernel",
+         HostApplyGradient<input_type, output_type>(
+             stream, d.B, d.T, d.C, d.H,
+             static_cast<input_type *>(buffers[0]), static_cast<input_type *>(buffers[1]), static_cast<input_type *>(buffers[2]),
+             static_cast<input_type *>(buffers[3]), static_cast<input_type *>(buffers[4]), static_cast<output_type *>(buffers[5]),
+             static_cast<output_type *>(buffers[6]), static_cast<output_type *>(buffers[7]), static_cast<output_type *>(buffers[8]),
+             static_cast<output_type *>(buffers[9]), static_cast<output_type *>(buffers[10])
+         );
+     )
+ }
+
+
+ } // namespace gpu_ops
+
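For reference, the per-head recurrence that `kernel_forward_core` implements (read directly off the inner loop above: thread i owns one output channel of its head, j runs over the head dimension N = `_N_`, and the decay is the doubly exponentiated `_w`) is:

\begin{aligned}
w_t[j] &= \exp\bigl(-\exp(\hat{w}_t[j])\bigr) \\
y_t[i] &= \sum_{j=0}^{N-1} r_t[j]\,\bigl(u[j]\,k_t[j]\,v_t[i] + S_{t-1}[i,j]\bigr) \\
S_t[i,j] &= w_t[j]\,S_{t-1}[i,j] + k_t[j]\,v_t[i]
\end{aligned}

with the state S starting at zero in `kernel_forward`, or at the incoming state selected by `map[b]` in `kernel_forward_state`.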
rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp
@@ -0,0 +1,44 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #include "kernels.h"
+ #include "pybind11_kernel_helpers.h"
+
+ namespace {
+ pybind11::dict WKVRegistrations() {
+   pybind11::dict dict;
+   dict["wkv_forward"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_forward_fn);
+   dict["wkv_backward"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_backward_fn);
+   dict["wkv_forward_with_state"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_forward_with_state_fn);
+   return dict;
+ }
+
+ PYBIND11_MODULE(gpu_ops, m) {
+   m.def("get_rwkv_registrations", &WKVRegistrations);
+   m.def("create_rwkv_descriptor",
+         [](int B, int T, int C, int H, bool S, gpu_ops::ElementType input_type, gpu_ops::ElementType output_type) {
+           return gpu_ops::PackDescriptor(gpu_ops::WKVDescriptor{B, T, C, H, S, input_type, output_type});
+         });
+
+   pybind11::enum_<gpu_ops::ElementType>(m, "ElementType")
+       .value("BF16", gpu_ops::ElementType::BF16)
+       .value("F16", gpu_ops::ElementType::F16)
+       .value("F32", gpu_ops::ElementType::F32);
+
+ }
+ } // namespace
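These registrations follow the usual JAX GPU custom-call pattern: `get_rwkv_registrations` returns capsules for the three entry points, and `create_rwkv_descriptor` packs the problem size into the opaque string that `UnpackDescriptor` reads on the C++ side. A minimal sketch of how the Python side might consume the compiled module; the `xla_client` registration calls, the temporary names, and the example shapes are assumptions for illustration, not code taken from this package:

# Hypothetical usage sketch; gpu_ops is the compiled PYBIND11_MODULE defined above.
from jax.lib import xla_client

import gpu_ops

# Expose wkv_forward / wkv_backward / wkv_forward_with_state to XLA as custom-call targets.
for _name, _capsule in gpu_ops.get_rwkv_registrations().items():
    xla_client.register_custom_call_target(_name, _capsule, platform="gpu")

# Pack (B, T, C, H, S) and the element types into the opaque descriptor.
B, T, C, H, has_state = 1, 16, 64, 1, False  # example shapes only
opaque = gpu_ops.create_rwkv_descriptor(
    B, T, C, H, has_state,
    gpu_ops.ElementType.BF16,  # input element type
    gpu_ops.ElementType.F32,   # output element type
)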
rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h
@@ -0,0 +1,64 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ // This header is not specific to our application and you'll probably want
+ // something like this for any extension you're building. This includes the
+ // infrastructure needed to serialize descriptors that are used with the
+ // "opaque" parameter of the GPU custom call. In our example we'll use this
+ // parameter to pass the size of our problem.
+
+ #ifndef _GPU_OPS_KERNEL_HELPERS_H_
+ #define _GPU_OPS_KERNEL_HELPERS_H_
+
+ #include <cstdint>
+ #include <stdexcept>
+ #include <string>
+ #include <type_traits>
+
+ #define JAX_APEX_WARP_SIZE 32
+
+ namespace gpu_ops {
+
+ // https://en.cppreference.com/w/cpp/numeric/bit_cast
+ template <class To, class From>
+ typename std::enable_if<sizeof(To) == sizeof(From) &&
+                             std::is_trivially_copyable<From>::value &&
+                             std::is_trivially_copyable<To>::value,
+                         To>::type
+ bit_cast(const From &src) noexcept {
+   static_assert(std::is_trivially_constructible<To>::value,
+                 "This implementation additionally requires destination type to "
+                 "be trivially constructible");
+
+   To dst;
+   memcpy(&dst, &src, sizeof(To));
+   return dst;
+ }
+
+ template <typename T> std::string PackDescriptorAsString(const T &descriptor) {
+   return std::string(bit_cast<const char *>(&descriptor), sizeof(T));
+ }
+
+ template <typename T>
+ const T *UnpackDescriptor(const char *opaque, std::size_t opaque_len) {
+   if (opaque_len != sizeof(T)) {
+     throw std::runtime_error("Invalid opaque object size");
+   }
+   return bit_cast<const T *>(opaque);
+ }
+
+ } // namespace gpu_ops
+
+ #endif
rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h
@@ -0,0 +1,56 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #ifndef _GPU_OPS_KERNELS_H_
+ #define _GPU_OPS_KERNELS_H_
+
+ #include <hip/hip_runtime_api.h>
+
+ #include <cstddef>
+ #include <cstdint>
+
+ #ifndef _N_
+ #define _N_ 8
+ #endif
+ #ifndef _T_
+ #define _T_ 16
+ #endif
+ namespace gpu_ops {
+
+ enum ElementType { BF16, F16, F32 };
+
+ struct WKVDescriptor {
+   int B;
+   int T;
+   int C;
+   int H;
+   bool S;
+   ElementType x_type;
+   ElementType y_type;
+ };
+
+ void rwkv_forward_fn(hipStream_t stream, void **buffers,
+                      const char *opaque,
+                      std::size_t opaque_len);
+ void rwkv_backward_fn(hipStream_t stream, void **buffers,
+                       const char *opaque,
+                       std::size_t opaque_len);
+
+ void rwkv_forward_with_state_fn(hipStream_t stream, void **buffers,
+                                 const char *opaque,
+                                 std::size_t opaque_len);
+ } // namespace gpu_ops
+
+ #endif