rwkv-ops 0.1.1__py3-none-any.whl → 0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rwkv-ops might be problematic.
- rwkv_ops/__init__.py +3 -1
- rwkv_ops/rwkv6_kernel/__init__.py +126 -0
- rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py +724 -0
- rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py +86 -0
- rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py +305 -0
- rwkv_ops/rwkv7_kernel/__init__.py +3 -7
- rwkv_ops/rwkv7_kernel/torch_op.py +67 -29
- rwkv_ops-0.2.dist-info/METADATA +258 -0
- {rwkv_ops-0.1.1.dist-info → rwkv_ops-0.2.dist-info}/RECORD +12 -8
- rwkv_ops-0.1.1.dist-info/METADATA +0 -119
- {rwkv_ops-0.1.1.dist-info → rwkv_ops-0.2.dist-info}/LICENSE.txt +0 -0
- {rwkv_ops-0.1.1.dist-info → rwkv_ops-0.2.dist-info}/WHEEL +0 -0
- {rwkv_ops-0.1.1.dist-info → rwkv_ops-0.2.dist-info}/top_level.txt +0 -0

rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py
@@ -0,0 +1,86 @@
+from keras import ops
+import keras
+
+
+class RWKVKernelOperator:
+    def __init__(self, head_size, max_sequence_length):
+        self.head_size = head_size
+        self.max_sequence_length = max_sequence_length
+
+    def __call__(
+        self, r, k, v, w, u, with_state=False, init_state=None, state_map=None
+    ):
+        B, T, C = ops.shape(r)
+        assert C % self.head_size == 0
+        H = C // self.head_size
+        w = ops.reshape(w, [B, T, H, self.head_size, 1])
+        k = ops.reshape(k, [B, T, H, self.head_size, 1])
+
+        v = ops.reshape(v, [B, T, H, 1, self.head_size])
+        r = ops.reshape(r, [B, T, H, 1, self.head_size])
+        u = ops.reshape(u, [1, H, self.head_size, 1])
+
+        if init_state is not None:
+            assert len(init_state.shape) in [3, 4], (
+                "init_state must have shape (state_kinds, num_heads, head_size, head_size) or (num_heads, head_size, head_size)"
+            )
+            if len(init_state.shape) == 3:
+                assert init_state.shape == (H, self.head_size, self.head_size), (
+                    "init_state must have shape (num_heads, head_size, head_size)"
+                )
+                init_state = init_state[None, :]
+            else:
+                assert init_state.shape[1:] == (H, self.head_size, self.head_size), (
+                    "init_state must have shape (state_kinds, num_heads, head_size, head_size)"
+                )
+            state_kinds = init_state.shape[0]
+            if state_map is None:
+                state_kinds = init_state.shape[0]
+                if state_kinds == 1:
+                    state_map = ops.zeros(shape=(B,), dtype="int32")
+                elif state_kinds == B:
+                    state_map = ops.convert_to_tensor(
+                        [i for i in range(B)], dtype="int32"
+                    )
+                else:
+                    raise ValueError(
+                        "Cannot infer state_map from init_state; please pass state_map explicitly"
+                    )
+
+            else:
+                if isinstance(state_map, list):
+                    state_map = ops.convert_to_tensor(state_map, dtype="int32")
+                state_map = ops.cast(state_map, "int32")
+                assert (state_map >= 0).all() and (state_map < state_kinds).all(), (
+                    f"state_map values must lie in [0, {state_kinds})"
+                )
+            s = ops.take(init_state, state_map, axis=0)
+
+        else:
+            assert state_map is None
+            s = ops.zeros((B, H, self.head_size, self.head_size), dtype=u.dtype)
+
+        w = ops.exp(-ops.exp(w))
+
+        def cond(i, k, v, w, r, s, y):
+            return i < T
+
+        def body(i, k, v, w, r, s, y):
+            k_t = ops.take(k, i, 1)
+            v_t = ops.take(v, i, 1)
+            kv_t = k_t @ v_t
+            w_t = ops.take(w, i, 1)
+
+            r_t = ops.take(r, i, 1)
+            y_t = r_t @ (u * kv_t + s)
+            y_t = ops.reshape(y_t, (B, 1, C))
+            s = kv_t + w_t * s
+
+            y = ops.slice_update(y, [0, i, 0], y_t)
+            return i + 1, k, v, w, r, s, y
+
+        y = ops.zeros([B, T, C], r.dtype)
+        i, k, v, w, r, s, y = ops.while_loop(cond, body, (0, k, v, w, r, s, y), T)
+        if with_state:
+            return y, s
+        return y, None

rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py
@@ -0,0 +1,305 @@
+import os
+import torch
+from torch.utils.cpp_extension import load
+from keras import ops
+
+kernel_dir_name = "torch_kernel"
+
+use_rocm = "RWKV_USE_ROCM" in os.environ and os.environ["RWKV_USE_ROCM"] == "1"
+
+
+class RWKVKernelOperator:
+    def __init__(self, head_size, max_sequence_length):
+        current_dir = os.path.dirname(__file__)
+        # current_dir = os.pat
+        if use_rocm:
+            wkv6_cuda = load(
+                name="wkv6",
+                sources=[
+                    os.path.join(current_dir, f"{kernel_dir_name}/wkv6_op.cpp"),
+                    os.path.join(current_dir, f"{kernel_dir_name}/wkv6_cuda.cu"),
+                ],
+                # verbose=True, extra_cuda_cflags=[f"-D_N_={head_size}", f"-D_T_={max_sequence_length}"])
+                verbose=True,
+                extra_cuda_cflags=[
+                    "-fopenmp -ffast-math -munsafe-fp-atomics --gpu-max-threads-per-block=120 -enable-vectorize-compares",
+                    f"-D_N_={head_size}",
+                    f"-D_T_={max_sequence_length}",
+                ],
+            )
+        else:
+            wkv6_cuda = load(
+                name="wkv6",
+                sources=[
+                    os.path.join(current_dir, f"{kernel_dir_name}/wkv6_op.cpp"),
+                    os.path.join(current_dir, f"{kernel_dir_name}/wkv6_cuda.cu"),
+                ],
+                # verbose=True, extra_cuda_cflags=[f"-D_N_={head_size}", f"-D_T_={max_sequence_length}"])
+                verbose=True,
+                extra_cuda_cflags=[
+                    "-res-usage",
+                    "--use_fast_math",
+                    "-O3",
+                    "-Xptxas -O3",
+                    "--extra-device-vectorization",
+                    f"-D_N_={head_size}",
+                    f"-D_T_={max_sequence_length}",
+                ],
+            )
+
+        class RWKV_6(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, B, T, C, H, r, k, v, w, u):
+                if not isinstance(u, torch.Tensor):
+                    u = u.value
+                with torch.no_grad():
+                    assert r.dtype == k.dtype == v.dtype == w.dtype == u.dtype
+                    assert r.dtype in [torch.float32, torch.bfloat16, torch.float16]
+
+                    assert head_size == C // H
+                    ctx.B = B
+                    ctx.T = T
+                    ctx.C = C
+                    ctx.H = H
+                    assert r.is_contiguous()
+                    assert k.is_contiguous()
+                    assert v.is_contiguous()
+                    assert w.is_contiguous()
+                    assert u.is_contiguous()
+                    ctx.save_for_backward(r, k, v, w, u)
+
+                    y_dtype = r.dtype if r.dtype != torch.float16 else torch.float32
+
+                    y = torch.empty(
+                        (B, T, C),
+                        device=r.device,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+
+                    if r.dtype == torch.float32:
+                        wkv6_cuda.forward_fp32(B, T, C, H, r, k, v, w, u, y)
+                    elif r.dtype == torch.bfloat16:
+                        wkv6_cuda.forward_bf16(B, T, C, H, r, k, v, w, u, y)
+                    else:
+                        wkv6_cuda.forward_fp16(B, T, C, H, r, k, v, w, u, y)
+                    return y
+
+            @staticmethod
+            def backward(ctx, gy):
+                assert gy.is_cuda
+                with torch.no_grad():
+                    assert gy.dtype in [torch.bfloat16, torch.float32]
+                    B = ctx.B
+                    T = ctx.T
+                    C = ctx.C
+                    H = ctx.H
+                    assert gy.is_contiguous()
+                    r, k, v, w, u = ctx.saved_tensors
+                    y_dtype = r.dtype if r.dtype != torch.float16 else torch.float32
+
+                    gr = torch.empty(
+                        (B, T, C),
+                        device=gy.device,
+                        requires_grad=False,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+                    gk = torch.empty(
+                        (B, T, C),
+                        device=gy.device,
+                        requires_grad=False,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+                    gv = torch.empty(
+                        (B, T, C),
+                        device=gy.device,
+                        requires_grad=False,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+                    gw = torch.empty(
+                        (B, T, C),
+                        device=gy.device,
+                        requires_grad=False,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+                    gu = torch.empty(
+                        (B, C),
+                        device=gy.device,
+                        requires_grad=False,
+                        dtype=y_dtype,
+                        memory_format=torch.contiguous_format,
+                    )  # .uniform_(-100, 100)
+
+                    if r.dtype == torch.float32:
+                        wkv6_cuda.backward_fp32(
+                            B, T, C, H, r, k, v, w, u, gy, gr, gk, gv, gw, gu
+                        )
+                    elif r.dtype == torch.bfloat16:
+                        wkv6_cuda.backward_bf16(
+                            B, T, C, H, r, k, v, w, u, gy, gr, gk, gv, gw, gu
+                        )
+                    else:
+                        wkv6_cuda.backward_fp16(
+                            B, T, C, H, r, k, v, w, u, gy, gr, gk, gv, gw, gu
+                        )
+
+                    gu = torch.sum(gu, 0).view(H, C // H)
+
+                    return (None, None, None, None, gr, gk, gv, gw, gu)
+
+        class RWKV_6_with_state:
+            @staticmethod
+            def apply(B, T, C, H, S, s_map, r, k, v, w, u, s):
+                with torch.no_grad():
+                    assert s_map.dtype == torch.int64, (
+                        "s_map must be None or an int64 array of length B."
+                    )
+                    assert (s is None and s_map is None) or (
+                        s is not None and s_map is not None
+                    ), "init_state and s_map must both be None or both be given"
+                    assert (
+                        r.dtype == k.dtype == v.dtype == w.dtype == u.dtype
+                        and r.dtype in [torch.float16, torch.float32, torch.bfloat16]
+                    ), "r, k, v, w, u must share the same dtype, one of fp16, fp32, bf16"
+                    if r.dtype in [torch.float32, torch.bfloat16]:
+                        o_dtype = r.dtype
+                    else:
+                        o_dtype = torch.float32
+                    assert (
+                        r.device
+                        == k.device
+                        == v.device
+                        == w.device
+                        == u.device
+                        == s.device
+                        == s_map.device
+                    ), "please make sure r, k, v, w, u, s and s_map are on the same device"
+
+                    y = torch.empty(
+                        (B, T, C),
+                        device=r.device,
+                        dtype=o_dtype,
+                        memory_format=torch.contiguous_format,
+                    )
+                    ys = torch.empty(
+                        (B, H, head_size, head_size),
+                        device=r.device,
+                        dtype=o_dtype,
+                        memory_format=torch.contiguous_format,
+                    )
+                    # print(ys)
+                    if r.dtype == torch.bfloat16:
+                        wkv6_cuda.forward_with_state_bf16(
+                            B, T, C, H, S, s_map, r, k, v, w, u, s, y, ys
+                        )
+                    elif r.dtype == torch.float32:
+                        wkv6_cuda.forward_with_state_fp32(
+                            B, T, C, H, S, s_map, r, k, v, w, u, s, y, ys
+                        )
+                    else:
+                        wkv6_cuda.forward_with_state_fp16(
+                            B, T, C, H, S, s_map, r, k, v, w, u, s, y, ys
+                        )
+
+                    return y, ys
+
+        self.head_size = head_size
+        self.normal_kernel = RWKV_6
+        self.kernel_with_state = RWKV_6_with_state
+
+    def __call__(
+        self, r, k, v, w, u, with_state=False, init_state=None, state_map=None
+    ):
+        B, T, C = r.shape
+        assert C % self.head_size == 0
+        H = C // self.head_size
+        if not isinstance(u, torch.Tensor):
+            u = u.value
+
+        assert r.is_cuda
+        assert k.is_cuda
+        assert v.is_cuda
+        assert w.is_cuda
+        assert u.is_cuda
+
+        if isinstance(r, torch.Tensor):
+            assert r.device == k.device == v.device == w.device == u.device
+        else:
+            assert r.get_device() == k.get_device() == v.get_device() == w.get_device() == u.get_device()
+
+        assert r.dtype == k.dtype == v.dtype == w.dtype == u.dtype
+
+        if r.dtype in [torch.float32, torch.bfloat16]:
+            s_dtype = r.dtype
+        else:
+            s_dtype = torch.float32
+
+        is_custom_init = init_state is not None
+
+        if init_state is not None:
+            assert len(init_state.shape) in [3, 4], (
+                "init_state must have shape (state_kinds /* <= batch_size */, num_heads, head_size, head_size) or (num_heads, head_size, head_size)"
+            )
+            if len(init_state.shape) == 3:
+                init_state = init_state[None, :]
+            assert (
+                init_state.shape[1:] == (H, self.head_size, self.head_size)
+                and init_state.shape[0] <= B
+            ), (
+                "init_state must have shape (state_kinds /* <= batch_size */, num_heads, head_size, head_size) or (num_heads, head_size, head_size)"
+            )
+
+            assert init_state.dtype == s_dtype, f"init_state dtype should be {s_dtype}"
+            assert init_state.device == r.device
+
+        if state_map is not None:
+            if isinstance(state_map, list):
+                state_map = torch.tensor(state_map, dtype=torch.int64)
+            elif isinstance(state_map, torch.Tensor):
+                assert state_map.dtype in [torch.int32, torch.int64], (
+                    "state_map must be an int64 mapping array of length batch_size"
+                )
+                state_map = state_map.to(torch.int64)
+            assert state_map.shape == (B,), "state_map must have shape (batch_size,)"
+            assert state_map.device == r.device
+
+        if with_state:
+            if init_state is None:
+                assert state_map is None, (
+                    "state_map can only be used when init_state is given"
+                )
+                init_state = torch.zeros((0,), device=r.device, dtype=s_dtype)
+                state_map = torch.zeros((0,), device=r.device, dtype=torch.int64)
+            else:
+                n_state = init_state.shape[0]
+                if state_map is None:
+                    assert n_state == 1 or n_state == B, (
+                        "Cannot infer state_map; please pass it explicitly."
+                    )
+                    if n_state == 1:
+                        state_map = torch.tensor(
+                            [0] * B, dtype=torch.int64, device=r.device
+                        )
+                    elif n_state == B:
+                        state_map = torch.tensor(
+                            [i for i in range(B)], dtype=torch.int64, device=r.device
+                        )
+                    else:
+                        assert False, "not implemented"
+                else:
+                    assert state_map.shape == (B,), "state_map must have shape (batch_size,)"
+                    assert (state_map >= 0).all() and (state_map < n_state).all(), (
+                        f"state_map values must be integers in [0, {n_state})"
+                    )
+            # print('state map:',state_map)
+            o, ys = self.kernel_with_state.apply(
+                B, T, C, H, is_custom_init, state_map, r, k, v, w, u, init_state
+            )
+            return o, ys
+        else:
+            o = self.normal_kernel.apply(B, T, C, H, r, k, v, w, u)
+            return o, None
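
A hedged sketch of driving the CUDA-backed operator above with an explicit initial state. It assumes a CUDA device, a working nvcc toolchain (the constructor JIT-compiles the wkv6 extension), and the usual RWKV-6 `(num_heads, head_size)` layout for `u`; shapes and values are illustrative only.

import torch
from rwkv_ops.rwkv6_kernel.torch_rwkv_kernel import RWKVKernelOperator  # path per the file list above

B, T, C, head_size = 2, 8, 64, 16
H = C // head_size
op = RWKVKernelOperator(head_size=head_size, max_sequence_length=T)  # compiles the wkv6 extension

dev, dt = "cuda", torch.bfloat16
r, k, v, w = (torch.randn(B, T, C, device=dev, dtype=dt) for _ in range(4))
u = torch.randn(H, head_size, device=dev, dtype=dt)  # assumed (num_heads, head_size) layout

# One cached state shared by the whole batch (state_kinds == 1); each entry of
# state_map picks which init_state slice a batch element starts from.
init_state = torch.zeros(1, H, head_size, head_size, device=dev, dtype=dt)
state_map = torch.zeros(B, dtype=torch.int64, device=dev)  # may be omitted when state_kinds is 1 or B

y, final_state = op(r, k, v, w, u, with_state=True, init_state=init_state, state_map=state_map)
# y: (B, T, C); final_state: (B, H, head_size, head_size)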

rwkv_ops/rwkv7_kernel/__init__.py
@@ -25,6 +25,7 @@ def get_generalized_delta_rule(HEAD_SIZE=64, KERNEL_TYPE="native"):
     CHUNK_LEN = 16
     USE_KERNEL = True
     from torch.utils.cpp_extension import load
+    import os
 
     flags = [
         "-res-usage",

@@ -40,16 +41,11 @@ def get_generalized_delta_rule(HEAD_SIZE=64, KERNEL_TYPE="native"):
 
     # Get the directory of the current file
     current_dir_path = os.path.dirname(current_file_path)
-
-    # Get the parent directory path
-    parent_dir_path = os.path.abspath(
-        os.path.join(current_dir_path, os.path.pardir)
-    )
     load(
         name="wind_backstepping",
         sources=[
-            os.path.join(
-                os.path.join(
+            os.path.join(current_dir_path, "cuda_kernel/wkv7_cuda.cu"),
+            os.path.join(current_dir_path, "cuda_kernel/wkv7_op.cpp"),
         ],
         is_python_module=False,
         verbose=True,

rwkv_ops/rwkv7_kernel/torch_op.py
@@ -1,3 +1,15 @@
+# -*- coding: utf-8 -*-
+"""
+This file implements the forward and backward pass of a chunked delta rule attention mechanism,
+optimized with Triton kernels for GPU acceleration. It includes functions for forward propagation,
+backward gradient computation, and integration with PyTorch's autograd system.
+"""
+
 import warnings
 from typing import Optional
 

@@ -43,6 +55,27 @@ def chunk_dplr_fwd(
     output_final_state: bool = True,
     chunk_size: int = 16,
 ):
+    """
+    Forward pass of chunked delta rule attention.
+
+    Args:
+        q (torch.Tensor): Queries tensor [B, T, H, K]
+        k (torch.Tensor): Keys tensor [B, T, H, K]
+        v (torch.Tensor): Values tensor [B, T, H, V]
+        a (torch.Tensor): Activations tensor [B, T, H, K]
+        b (torch.Tensor): Betas tensor [B, T, H, K]
+        gk (torch.Tensor): Log decay tensor [B, T, H, K]
+        scale (float): Scale factor for attention scores
+        initial_state (Optional[torch.Tensor]): Initial state for recurrent processing
+        output_final_state (bool): Whether to return the final state
+        chunk_size (int): Chunk size for processing
+
+    Returns:
+        o (torch.Tensor): Output tensor [B, T, H, V]
+        final_state (Optional[torch.Tensor]): Final state if requested
+    """
     T = q.shape[1]
     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
     gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT)
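
A hedged sketch of calling the forward kernel directly, using only the parameter names and shapes from the Args block above; the import path, a CUDA device with Triton available, and keyword-only usage are assumptions, and in practice this function is normally reached through the package's higher-level interface.

import torch
from rwkv_ops.rwkv7_kernel.torch_op import chunk_dplr_fwd  # assumed import path

B, T, H, K, V = 1, 32, 2, 64, 64
dev, dt = "cuda", torch.bfloat16

q, k, a, b, gk = (torch.randn(B, T, H, K, device=dev, dtype=dt) for _ in range(5))
v = torch.randn(B, T, H, V, device=dev, dtype=dt)

o, final_state = chunk_dplr_fwd(
    q=q, k=k, v=v, a=a, b=b, gk=gk,
    scale=K ** -0.5,
    initial_state=None,
    output_final_state=True,
    chunk_size=16,
)
# o: [B, T, H, V]; final_state is returned because output_final_state=True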

@@ -100,6 +133,33 @@ def chunk_dplr_bwd(
     dht,
     BT: int = 16,
 ):
+    """
+    Backward pass of chunked delta rule attention.
+
+    Args:
+        q (torch.Tensor): Queries tensor [B, T, H, K]
+        k (torch.Tensor): Keys tensor [B, T, H, K]
+        v (torch.Tensor): Values tensor [B, T, H, V]
+        a (torch.Tensor): Activations tensor [B, T, H, K]
+        b (torch.Tensor): Betas tensor [B, T, H, K]
+        gk (torch.Tensor): Log decay tensor [B, T, H, K]
+        initial_state (torch.Tensor): Initial state for recurrent processing
+        scale (float): Scale factor for attention scores
+        do (torch.Tensor): Gradient of the outputs
+        dht (torch.Tensor): Gradient of the final hidden state
+        BT (int): Chunk size for processing
+
+    Returns:
+        dq (torch.Tensor): Gradient of the queries
+        dk (torch.Tensor): Gradient of the keys
+        dv (torch.Tensor): Gradient of the values
+        da (torch.Tensor): Gradient of the activations
+        db (torch.Tensor): Gradient of the betas
+        dgk (torch.Tensor): Gradient of the log decays
+        dh0 (torch.Tensor): Gradient of the initial state
+    """
     # ******* start recomputing everything, otherwise i believe the gpu memory will be exhausted *******
     gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT)
     A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_dplr_fwd_intra(

@@ -279,6 +339,10 @@ def chunk_dplr_delta_rule(
     cu_seqlens: Optional[torch.LongTensor] = None,
 ):
     r"""
+    Main interface function for chunked delta rule attention.
+
     Args:
         q (torch.Tensor):
             queries of shape `[B, T, H, K]`

@@ -361,35 +425,9 @@ def chunk_rwkv7(
     output_final_state: bool = True,
 ):
     """
-
-
-
-        k (torch.Tensor):
-            k of shape `[B, H, T, K]`.
-        v (torch.Tensor):
-            v of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
-        a (torch.Tensor):
-            a of shape `[B, H, T, K]`.
-        b (torch.Tensor):
-            b of shape `[B, H, T, K]`.
-        w (torch.Tensor):
-            decay of shape `[B, H, T, K]`, the kernel
-            will apply log_w = -torch.exp(w)
-        log_w (torch.Tensor):
-            log decay of shape `[B, H, T, K]`.
-        scale (float):
-            scale of the attention.
-        initial_state (Optional[torch.Tensor]):
-            Initial state of shape `[N, H, K, V]` for `N` input sequences.
-            For equal-length input sequences, `N` equals the batch size `B`.
-            Default: `None`.
-        output_final_state (Optional[bool]):
-            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
-        cu_seqlens (torch.LongTensor):
-            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
-            consistent with the FlashAttention API.
-        head_first (bool):
-            whether to use head first. Recommended to be False to avoid extra transposes.
+    Interface function for RWKV-7 attention.
     """
 
     if w is not None:
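
The docstring removed above noted that when `w` is supplied the kernel derives the log decay itself; a one-line sketch of that relationship (the shape is illustrative only):

import torch

w = torch.randn(1, 4, 32, 64)   # decay parameter, [B, H, T, K] in the removed docstring
log_w = -torch.exp(w)           # equivalent log decay the kernel computes when only `w` is given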
|