rwkv-ops 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rwkv_ops/__init__.py +45 -0
  2. rwkv_ops/mhc_kernel/__init__.py +50 -0
  3. rwkv_ops/mhc_kernel/common_kernel/include/mhc_types.h +66 -0
  4. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_post_op.cuh +197 -0
  5. rwkv_ops/mhc_kernel/common_kernel/kernels/mhc_pre_op.cuh +212 -0
  6. rwkv_ops/mhc_kernel/common_kernel/kernels/rmsnorm.cuh +152 -0
  7. rwkv_ops/mhc_kernel/common_kernel/kernels/sinkhorn_knopp.cuh +158 -0
  8. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_aggregate.cuh +141 -0
  9. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_distribute.cuh +111 -0
  10. rwkv_ops/mhc_kernel/common_kernel/kernels/stream_mix.cuh +164 -0
  11. rwkv_ops/mhc_kernel/common_kernel/kernels/type_conversions.cuh +52 -0
  12. rwkv_ops/mhc_kernel/jax_kernel/CMakeLists.txt +47 -0
  13. rwkv_ops/mhc_kernel/jax_kernel/mhu_ffi.cu +652 -0
  14. rwkv_ops/mhc_kernel/jax_kernel/mhu_jax.py +939 -0
  15. rwkv_ops/mhc_kernel/native_keras_op.py +193 -0
  16. rwkv_ops/mhc_kernel/torch_kernel/mhc_cuda.cu +207 -0
  17. rwkv_ops/mhc_kernel/torch_kernel/mhc_op.cpp +296 -0
  18. rwkv_ops/mhc_kernel/torch_kernel/mhc_torch.py +306 -0
  19. rwkv_ops/rwkv6_kernel/__init__.py +120 -0
  20. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp +44 -0
  21. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h +64 -0
  22. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h +56 -0
  23. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h +41 -0
  24. rwkv_ops/rwkv6_kernel/jax_kernel_cuda/rwkv_kernels.cu +512 -0
  25. rwkv_ops/rwkv6_kernel/jax_kernel_hip/gpu_ops.cpp +44 -0
  26. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernel_helpers.h +64 -0
  27. rwkv_ops/rwkv6_kernel/jax_kernel_hip/kernels.h +56 -0
  28. rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h +41 -0
  29. rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip +514 -0
  30. rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +722 -0
  31. rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +90 -0
  32. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu +397 -0
  33. rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp +93 -0
  34. rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +305 -0
  35. rwkv_ops/rwkv7_kernel/__init__.py +113 -0
  36. rwkv_ops/rwkv7_kernel/get_jax_devices_info.py +220 -0
  37. rwkv_ops/rwkv7_kernel/get_torch_devices_info.py +250 -0
  38. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt +42 -0
  39. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu +399 -0
  40. rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py +311 -0
  41. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/CMakeLists.txt +42 -0
  42. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_ffi.cu +172 -0
  43. rwkv_ops/rwkv7_kernel/jax_cuda_kernel_single/wkv7_single_step_jax.py +190 -0
  44. rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py +9 -0
  45. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py +95 -0
  46. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py +60 -0
  47. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py +78 -0
  48. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py +80 -0
  49. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py +150 -0
  50. rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py +45 -0
  51. rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py +34 -0
  52. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_bwd.py +61 -0
  53. rwkv_ops/rwkv7_kernel/jax_kernel/wy_fast_fwd.py +86 -0
  54. rwkv_ops/rwkv7_kernel/jax_op.py +382 -0
  55. rwkv_ops/rwkv7_kernel/mlx_op.py +118 -0
  56. rwkv_ops/rwkv7_kernel/native_keras_op.py +108 -0
  57. rwkv_ops/rwkv7_kernel/tf_eager_kernel.py +155 -0
  58. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_cuda.cu +235 -0
  59. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp +63 -0
  60. rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_torch.py +233 -0
  61. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_cuda.cu +101 -0
  62. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_op.cpp +56 -0
  63. rwkv_ops/rwkv7_kernel/torch_cuda_kernel_single/wkv7_single_step_torch.py +112 -0
  64. rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py +13 -0
  65. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py +96 -0
  66. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py +64 -0
  67. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py +74 -0
  68. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py +75 -0
  69. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py +148 -0
  70. rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py +44 -0
  71. rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py +31 -0
  72. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_bwd.py +63 -0
  73. rwkv_ops/rwkv7_kernel/torch_kernel/wy_fast_fwd.py +79 -0
  74. rwkv_ops/rwkv7_kernel/torch_op.py +504 -0
  75. rwkv_ops/rwkv7_kernel/triton_kernel/__init__.py +34 -0
  76. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_bwd.py +328 -0
  77. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_A_fwd.py +186 -0
  78. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_bwd.py +157 -0
  79. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_h_fwd.py +160 -0
  80. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_bwd.py +382 -0
  81. rwkv_ops/rwkv7_kernel/triton_kernel/chunk_o_fwd.py +137 -0
  82. rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py +86 -0
  83. rwkv_ops/rwkv7_kernel/triton_kernel/utils.py +20 -0
  84. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py +193 -0
  85. rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py +326 -0
  86. rwkv_ops-0.6.1.dist-info/METADATA +495 -0
  87. rwkv_ops-0.6.1.dist-info/RECORD +89 -0
  88. rwkv_ops-0.6.1.dist-info/WHEEL +4 -0
  89. rwkv_ops-0.6.1.dist-info/licenses/LICENSE.txt +201 -0
rwkv_ops/rwkv6_kernel/__init__.py
@@ -0,0 +1,120 @@
+ # copy right from https://github.com/infiy-quine/RWKV6_Keras_Operator
+ import os
+ import keras
+ from keras import ops
+ from distutils.util import strtobool
+ from packaging import version
+
+
+ def get_rwkv6_kernel(KERNEL_TYPE="native"):
+     ops_kernel = True
+     if KERNEL_TYPE == "cuda":
+         if keras.config.backend() == "jax":
+             import jax
+
+             if version.parse(jax.__version__) < version.parse("0.6.0"):
+                 from .jax_rwkv_kernel import RWKVKernelOperator as CudaOperator
+
+                 ops_kernel = False
+             else:
+                 CudaOperator = None
+         elif keras.config.backend() == "torch":
+             from .torch_rwkv_kernel import RWKVKernelOperator as CudaOperator
+
+             ops_kernel = False
+         else:
+             CudaOperator = None
+     else:
+         CudaOperator = None
+     from .ops_rwkv_kernel import RWKVKernelOperator as OpsOperator
+
+     class RWKVKernelOperator:
+         def __init__(self, head_size, max_sequence_length, ops_loop=False):
+             self.enbale_cuda = CudaOperator is not None
+
+             if self.enbale_cuda:
+                 self.cuda_operator = CudaOperator(head_size, max_sequence_length)
+
+             self.ops_operator = OpsOperator(head_size, max_sequence_length)
+
+             self.ops_loop = ops_loop
+
+         def __call__(
+             self, r, k, v, w, u, with_state=False, init_state=None, state_map=None
+         ):
+             seq_len = r.shape[1]
+
+             def call_parallel():
+                 if self.enbale_cuda:
+                     return self.cuda_operator(
+                         r=r,
+                         k=k,
+                         v=v,
+                         w=w,
+                         u=u,
+                         with_state=with_state,
+                         init_state=init_state,
+                         state_map=state_map,
+                     )
+                 else:
+                     return self.ops_operator(
+                         r=r,
+                         k=k,
+                         v=v,
+                         w=w,
+                         u=u,
+                         with_state=with_state,
+                         init_state=init_state,
+                         state_map=state_map,
+                     )
+
+             def call_one_step():
+                 return self.ops_operator(
+                     r=r,
+                     k=k,
+                     v=v,
+                     w=w,
+                     u=u,
+                     with_state=with_state,
+                     init_state=init_state,
+                     state_map=state_map,
+                 )
+
+             if not self.ops_loop:
+                 return ops.cond(
+                     seq_len != 1 and not ops_kernel, call_parallel, call_one_step
+                 )
+             else:
+                 return call_parallel()
+
+     return RWKVKernelOperator
+
+
+ # from .ops_rwkv_kernal import RWKVKernelOperator as OPSKernelOperator
+
+
+ """
+ Three new parameters are added:
+ return_state: bool. Whether to return the final state; enable this switch if you also want to pass a custom init_state.
+
+ init_state
+ When init_state is omitted, the state is zero-initialized along the BatchSize dimension.
+ Shape: (state_kinds, num_heads, head_size, head_size), where state_kinds is a positive integer no larger than Batch_Size.
+ Precision: fp32 when r is fp16; otherwise the same dtype as r.
+
+
+ state_map
+ Shape: (Batch_Size,)
+ Precision: int64, list[int]
+ This array defines the mapping from states to each batch-dimension slice of r.
+ Value range: [0, state_kinds)
+
+ Returns:
+ output, output_state
+
+ def __call__(self, r, k, v, w, u, return_state=False, init_state=None, state_map=None):
+
+
+
+
+ """
rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp
@@ -0,0 +1,44 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #include "kernels.h"
+ #include "pybind11_kernel_helpers.h"
+
+ namespace {
+ pybind11::dict WKVRegistrations() {
+   pybind11::dict dict;
+   dict["wkv_forward"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_forward_fn);
+   dict["wkv_backward"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_backward_fn);
+   dict["wkv_forward_with_state"] =
+       gpu_ops::EncapsulateFunction(gpu_ops::rwkv_forward_with_state_fn);
+   return dict;
+ }
+
+ PYBIND11_MODULE(gpu_ops, m) {
+   m.def("get_rwkv_registrations", &WKVRegistrations);
+   m.def("create_rwkv_descriptor",
+         [](int B, int T, int C, int H, bool S, gpu_ops::ElementType input_type, gpu_ops::ElementType output_type) {
+           return gpu_ops::PackDescriptor(gpu_ops::WKVDescriptor{B, T, C, H, S, input_type, output_type});
+         });
+
+   pybind11::enum_<gpu_ops::ElementType>(m, "ElementType")
+       .value("BF16", gpu_ops::ElementType::BF16)
+       .value("F16", gpu_ops::ElementType::F16)
+       .value("F32", gpu_ops::ElementType::F32);
+
+ }
+ } // namespace
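
The pybind11 module above only exposes the registration dict, the descriptor packer, and the dtype enum; the JAX-side wiring lives in jax_rwkv_kernel.py, which is not shown in this excerpt. The sketch below is an editor's illustration of how such registrations are conventionally consumed with the pre-FFI custom-call API (jax < 0.6, matching the version gate in __init__.py), not the package's actual code.

# Editor's illustration: registering the custom-call targets exported by gpu_ops.
from jax.lib import xla_client

import gpu_ops  # the compiled extension defined by gpu_ops.cpp

# Each value is a PyCapsule produced by EncapsulateFunction() in
# pybind11_kernel_helpers.h, tagged "xla._CUSTOM_CALL_TARGET".
for name, capsule in gpu_ops.get_rwkv_registrations().items():
    xla_client.register_custom_call_target(name, capsule, platform="gpu")

# The opaque blob attached to each custom call is the packed WKVDescriptor bytes.
B, T, C, H = 2, 16, 256, 4  # illustrative problem sizes
opaque = gpu_ops.create_rwkv_descriptor(
    B, T, C, H, True, gpu_ops.ElementType.F32, gpu_ops.ElementType.F32
)
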
rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h
@@ -0,0 +1,64 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ // This header is not specific to our application and you'll probably want
+ // something like this for any extension you're building. This includes the
+ // infrastructure needed to serialize descriptors that are used with the
+ // "opaque" parameter of the GPU custom call. In our example we'll use this
+ // parameter to pass the size of our problem.
+
+ #ifndef _GPU_OPS_KERNEL_HELPERS_H_
+ #define _GPU_OPS_KERNEL_HELPERS_H_
+
+ #include <cstdint>
+ #include <stdexcept>
+ #include <string>
+ #include <type_traits>
+
+ #define JAX_APEX_WARP_SIZE 32
+
+ namespace gpu_ops {
+
+ // https://en.cppreference.com/w/cpp/numeric/bit_cast
+ template <class To, class From>
+ typename std::enable_if<sizeof(To) == sizeof(From) &&
+                             std::is_trivially_copyable<From>::value &&
+                             std::is_trivially_copyable<To>::value,
+                         To>::type
+ bit_cast(const From &src) noexcept {
+   static_assert(std::is_trivially_constructible<To>::value,
+                 "This implementation additionally requires destination type to "
+                 "be trivially constructible");
+
+   To dst;
+   memcpy(&dst, &src, sizeof(To));
+   return dst;
+ }
+
+ template <typename T> std::string PackDescriptorAsString(const T &descriptor) {
+   return std::string(bit_cast<const char *>(&descriptor), sizeof(T));
+ }
+
+ template <typename T>
+ const T *UnpackDescriptor(const char *opaque, std::size_t opaque_len) {
+   if (opaque_len != sizeof(T)) {
+     throw std::runtime_error("Invalid opaque object size");
+   }
+   return bit_cast<const T *>(opaque);
+ }
+
+ } // namespace gpu_ops
+
+ #endif
rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h
@@ -0,0 +1,56 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ #ifndef _GPU_OPS_KERNELS_H_
+ #define _GPU_OPS_KERNELS_H_
+
+ #include <cuda_runtime_api.h>
+
+ #include <cstddef>
+ #include <cstdint>
+
+ #ifndef _N_
+ #define _N_ 8
+ #endif
+ #ifndef _T_
+ #define _T_ 16
+ #endif
+ namespace gpu_ops {
+
+ enum ElementType { BF16, F16, F32 };
+
+ struct WKVDescriptor {
+   int B;
+   int T;
+   int C;
+   int H;
+   bool S;
+   ElementType x_type;
+   ElementType y_type;
+ };
+
+ void rwkv_forward_fn(cudaStream_t stream, void **buffers,
+                      const char *opaque,
+                      std::size_t opaque_len);
+ void rwkv_backward_fn(cudaStream_t stream, void **buffers,
+                       const char *opaque,
+                       std::size_t opaque_len);
+
+ void rwkv_forward_with_state_fn(cudaStream_t stream, void **buffers,
+                                 const char *opaque,
+                                 std::size_t opaque_len);
+ } // namespace gpu_ops
+
+ #endif
rwkv_ops/rwkv6_kernel/jax_kernel_cuda/pybind11_kernel_helpers.h
@@ -0,0 +1,41 @@
+ /* Copyright 2024 The JAX Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+
+ // This header extends kernel_helpers.h with the pybind11 specific interface to
+ // serializing descriptors. It also adds a pybind11 function for wrapping our
+ // custom calls in a Python capsule. This is separate from kernel_helpers so
+ // that the CUDA code itself doesn't include pybind11. I don't think that this
+ // is strictly necessary, but they do it in jaxlib, so let's do it here too.
+
+ #ifndef _GPU_OPS_PYBIND11_KERNEL_HELPERS_H_
+ #define _GPU_OPS_PYBIND11_KERNEL_HELPERS_H_
+
+ #include <pybind11/pybind11.h>
+
+ #include "kernel_helpers.h"
+
+ namespace gpu_ops {
+
+ template <typename T> pybind11::bytes PackDescriptor(const T &descriptor) {
+   return pybind11::bytes(PackDescriptorAsString(descriptor));
+ }
+
+ template <typename T> pybind11::capsule EncapsulateFunction(T *fn) {
+   return pybind11::capsule(bit_cast<void *>(fn), "xla._CUSTOM_CALL_TARGET");
+ }
+
+ } // namespace gpu_ops
+
+ #endif