rwkv-ops 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of rwkv-ops has been flagged as potentially problematic.

rwkv_ops/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.3.2"
+ __version__ = "0.3.3"
  import os

  KERNEL_TYPE = os.environ.get("KERNEL_TYPE", "cuda").lower()
@@ -103,9 +103,9 @@ class RWKVKernelOperator:
  bz, seq_len, hd_sz = r_type.shape

  assert hd_sz % head_size == 0
- assert reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz
+ ), "the elements of u (time first) is not equal to hidden_size"
  input_type = r_type.element_type

  if input_type in [ir.F32Type.get(), ir.BF16Type.get()]:
@@ -159,9 +159,9 @@ class RWKVKernelOperator:
  bz, seq_len, channels = r.shape
  assert channels % head_size == 0
  assert seq_len <= max_sequence_length
- assert reduce(lambda x, y: x * y, u.shape, 1) == channels, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u.shape, 1) == channels
+ ), "the elements of u (time first) is not equal to hidden_size"

  r_dtype = dtypes.canonicalize_dtype(r.dtype)
  k_dtype = dtypes.canonicalize_dtype(k.dtype)
@@ -237,9 +237,9 @@ class RWKVKernelOperator:
  bz, seq_len, hd_sz = r_type.shape

  assert hd_sz % head_size == 0
- assert reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz
+ ), "the elements of u (time first) is not equal to hidden_size"
  input_type = r_type.element_type

  if input_type in [ir.F32Type.get(), ir.BF16Type.get()]:
@@ -304,9 +304,9 @@ class RWKVKernelOperator:
  bz, seq_len, channels = r.shape
  assert channels % head_size == 0
  assert seq_len <= max_sequence_length
- assert reduce(lambda x, y: x * y, u.shape, 1) == channels, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u.shape, 1) == channels
+ ), "the elements of u (time first) is not equal to hidden_size"

  r_dtype = dtypes.canonicalize_dtype(r.dtype)
  k_dtype = dtypes.canonicalize_dtype(k.dtype)
@@ -374,9 +374,9 @@ class RWKVKernelOperator:
  n_state = jnp.shape(init_state)[0]
  B = jnp.shape(r)[0]
  # print('ns:',n_state,'B:',B,r.shape,k.shape,v.shape)
- assert n_state == 1 or n_state == B, (
- "我无法为您推断state_map的形状,请手动指定。"
- )
+ assert (
+ n_state == 1 or n_state == B
+ ), "我无法为您推断state_map的形状,请手动指定。"
  if n_state == 1:
  state_map = jnp.array([0] * B, dtype=jnp.int32)
  elif n_state == B:
@@ -392,12 +392,12 @@ class RWKVKernelOperator:
  jnp.int32,
  ], "state_map的数值类型必须为int32"
  state_map = jnp.astype(state_map, jnp.int32)
- assert jnp.all(state_map >= 0) and jnp.add(state_map < bz), (
- f"state_map内为state的映射下标,因此范围为: [0,{bz})"
- )
- assert (init_state is None) == (state_map is None), (
- "init_state与state_map必须同时传入"
- )
+ assert jnp.all(state_map >= 0) and jnp.add(
+ state_map < bz
+ ), f"state_map内为state的映射下标,因此范围为: [0,{bz})"
+ assert (init_state is None) == (
+ state_map is None
+ ), "init_state与state_map必须同时传入"

  if init_state is None:
  y, s = _rwkv_fwd_state_p.bind(r, k, v, w, u)
@@ -440,9 +440,9 @@ class RWKVKernelOperator:

  assert hd_sz % head_size == 0
  num_heads = hd_sz // head_size
- assert reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u_type.shape, 1) == hd_sz
+ ), "the elements of u (time first) is not equal to hidden_size"
  input_type = r_type.element_type

  if input_type in [ir.F32Type.get(), ir.BF16Type.get()]:
@@ -452,25 +452,25 @@ class RWKVKernelOperator:
  state_shape = (bz, num_heads, head_size, head_size)

  if with_init_state:
- assert s_map is not None, (
- "您必须同时传入init_state与state_map 或者都赋值为None."
- )
+ assert (
+ s_map is not None
+ ), "您必须同时传入init_state与state_map 或者都赋值为None."

  s_type = ir.RankedTensorType(s.type)
  sm_type = ir.RankedTensorType(s_map.type)
  # print(sm_type, ir.IntegerType.get_signless(64))
- assert sm_type.element_type == ir.IntegerType.get_signless(32), (
- "state_map的数据类型必须为int32"
- )
+ assert sm_type.element_type == ir.IntegerType.get_signless(
+ 32
+ ), "state_map的数据类型必须为int32"
  # print(sm_type.shape,bz)
- assert tuple(sm_type.shape) == (bz,), (
- "state_map的shape 形状必须为(batch_size,)"
- )
+ assert tuple(sm_type.shape) == (
+ bz,
+ ), "state_map的shape 形状必须为(batch_size,)"

  assert s_type.element_type == output_type
- assert tuple(s_type.shape) == state_shape, (
- "the shape of init state must be (batch_size,num_heads,head_size,head_size)"
- )
+ assert (
+ tuple(s_type.shape) == state_shape
+ ), "the shape of init state must be (batch_size,num_heads,head_size,head_size)"
  # assert s_type.shape[0] == bz and reduce(lambda x,y: x * y, s_type.shape[1:],1) == head_size * hd_sz,"the shape of init state must be (batch_size,num_heads,head_size,head_size)"

  opaque = rwkv_kernel.create_rwkv_descriptor(
@@ -535,9 +535,9 @@ class RWKVKernelOperator:
  bz, seq_len, channels = r.shape
  assert channels % head_size == 0
  assert seq_len <= max_sequence_length
- assert reduce(lambda x, y: x * y, u.shape, 1) == channels, (
- "the elements of u (time first) is not equal to hidden_size"
- )
+ assert (
+ reduce(lambda x, y: x * y, u.shape, 1) == channels
+ ), "the elements of u (time first) is not equal to hidden_size"
  num_heads = channels // head_size
  r_dtype = dtypes.canonicalize_dtype(r.dtype)
  k_dtype = dtypes.canonicalize_dtype(k.dtype)
@@ -558,9 +558,9 @@ class RWKVKernelOperator:
  if s is not None:
  s_dtype = dtypes.canonicalize_dtype(s.dtype)
  assert s_dtype == output_dtype
- assert s.shape == state_shape, (
- "the shape of init_state must be (batch_size, seq_len, num_heads, head_size, head_size)"
- )
+ assert (
+ s.shape == state_shape
+ ), "the shape of init_state must be (batch_size, seq_len, num_heads, head_size, head_size)"

  return [
  ShapedArray(
@@ -586,22 +586,22 @@ class RWKVKernelOperator:
  def _load_or_build_kernel(head_size, max_sequence_length):
  assert head_size % 4 == 0, f"head size必须是4的倍数,而{head_size}显然不是."
  assert isinstance(head_size, int), "你是在搞笑吗? head_size肯定得是int类型的啊"
- assert isinstance(max_sequence_length, int), (
- "你是在搞笑吗? max_sequence_length肯定得是int类型的啊"
- )
- assert head_size > 0 and max_sequence_length > 0, (
- "难绷,head_sizemax_sequence_length肯定得是大于0的正整数啊。"
- )
- assert os.path.exists(cuda_lib_dir) and len(os.listdir(cuda_lib_dir)) > 0, (
- f"请检查{cuda_lib_dir}文件夹是否存在,这个文件本质是是您的cuda library的超链接。"
- )
+ assert isinstance(
+ max_sequence_length, int
+ ), "你是在搞笑吗? max_sequence_length肯定得是int类型的啊"
+ assert (
+ head_size > 0 and max_sequence_length > 0
+ ), "难绷,head_size与max_sequence_length肯定得是大于0的正整数啊。"
+ assert (
+ os.path.exists(cuda_lib_dir) and len(os.listdir(cuda_lib_dir)) > 0
+ ), f"请检查{cuda_lib_dir}文件夹是否存在,这个文件本质是是您的cuda library的超链接。"
  kernel_dir = os.path.abspath(
  os.path.join(os.path.dirname(__file__), kernel_dir_name)
  )
  builds_dir = os.path.join(kernel_dir, "builds")
- assert os.path.exists(kernel_dir), (
- f"找不到{kernel_dir_name}文件夹,请问您的文件是完整的吗?"
- )
+ assert os.path.exists(
+ kernel_dir
+ ), f"找不到{kernel_dir_name}文件夹,请问您的文件是完整的吗?"
  if not os.path.exists(builds_dir):
  os.mkdir(builds_dir)
  target_dir_name = f"_N_{head_size}_T_{max_sequence_length}"
@@ -55,9 +55,9 @@ class RWKVKernelOperator:
  if isinstance(state_map, list):
  state_map = ops.convert_to_tensor(state_map, dtype="int32")
  state_map = ops.cast(state_map, "int32")
- assert (state_map >= 0).all() and (state_map < state_kinds).all(), (
- f"请确保state_map的值域为[0, {state_kinds})"
- )
+ assert (state_map >= 0).all() and (
+ state_map < state_kinds
+ ).all(), f"请确保state_map的值域为[0, {state_kinds})"
  s = ops.take(init_state, state_map, axis=0)

  else:
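The hunks above only restyle the `state_map` checks, but the `ops.take(init_state, state_map, axis=0)` context line shows the underlying convention: `state_map[i]` selects which row of `init_state` is used for batch element `i`, so every value must lie in `[0, state_kinds)`. A minimal sketch of that convention, using hypothetical shapes and Keras 3 `ops`:

```python
# Hedged illustration of the state_map convention; shapes are hypothetical.
from keras import ops

batch_size, num_heads, head_size = 4, 2, 64
init_state = ops.zeros((2, num_heads, head_size, head_size))    # state_kinds = 2 distinct states
state_map = ops.convert_to_tensor([0, 0, 1, 1], dtype="int32")  # items 0-1 share state 0, items 2-3 share state 1
s = ops.take(init_state, state_map, axis=0)                     # -> (batch_size, num_heads, head_size, head_size)
```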
@@ -154,9 +154,9 @@ class RWKVKernelOperator:
  @staticmethod
  def apply(B, T, C, H, S, s_map, r, k, v, w, u, s):
  with torch.no_grad():
- assert s_map.dtype == torch.int64, (
- "s_map 必须为None 或者是长度为B的,int64类型的数组。"
- )
+ assert (
+ s_map.dtype == torch.int64
+ ), "s_map 必须为None 或者是长度为B的,int64类型的数组。"
  assert (s is None and s_map is None) or (
  s is not None and s_map is not None
  ), "init_state与s_map必须同时为None 或者同时不为None"
@@ -240,17 +240,15 @@ class RWKVKernelOperator:
  is_custom_init = init_state is not None

  if init_state is not None:
- assert len(init_state.shape) in [3, 4], (
- "init_state 的形状必须为(state_kinds /*<= Batch_size*/,num_heads,head_size,head_size) 或者(num_heads,head_size,head_size)"
- )
+ assert (
+ len(init_state.shape) in [3, 4]
+ ), "init_state 的形状必须为(state_kinds /*<= Batch_size*/,num_heads,head_size,head_size) 或者(num_heads,head_size,head_size)"
  if len(init_state.shape) == 3:
  init_state = init_state[None, :]
  assert (
  init_state.shape[1:] == (H, self.head_size, self.head_size)
  and init_state.shape[0] <= B
- ), (
- "init_state 的形状必须为(state_kinds /*<= Batch_size*/,num_heads,head_size,head_size) 或者(num_heads,head_size,head_size)"
- )
+ ), "init_state 的形状必须为(state_kinds /*<= Batch_size*/,num_heads,head_size,head_size) 或者(num_heads,head_size,head_size)"

  assert init_state.dtype == s_dtype, f"init_state的数值类型应为: {s_dtype}"
  assert init_state.device == r.device
@@ -269,17 +267,17 @@ class RWKVKernelOperator:

  if with_state:
  if init_state is None:
- assert state_map is None, (
- "您必须在指定了init_state的情况下才能使用state_map"
- )
+ assert (
+ state_map is None
+ ), "您必须在指定了init_state的情况下才能使用state_map"
  init_state = torch.zeros((0,), device=r.device, dtype=s_dtype)
  state_map = torch.zeros((0,), device=r.device, dtype=torch.int64)
  else:
  n_state = init_state.shape[0]
  if state_map is None:
- assert n_state == 1 or n_state == B, (
- "我无法为您推断state_map的形状,请手动指定。"
- )
+ assert (
+ n_state == 1 or n_state == B
+ ), "我无法为您推断state_map的形状,请手动指定。"
  if n_state == 1:
  state_map = torch.tensor(
  [0] * B, dtype=torch.int64, device=r.device
@@ -292,9 +290,9 @@ class RWKVKernelOperator:
  assert False, "未实现"
  else:
  assert state_map.shape == (B,), "state_map的形状必须为(batch_size,)"
- assert (state_map >= 0).all() and (state_map < n_state).all(), (
- f"state_map的取值范围为[0,{n_state})之间的整数,您的输入显然不满足。"
- )
+ assert (
+ (state_map >= 0).all() and (state_map < n_state).all()
+ ), f"state_map的取值范围为[0,{n_state})之间的整数,您的输入显然不满足。"
  # print('state map:',state_map)
  o, ys = self.kernel_with_state.apply(
  B, T, C, H, is_custom_init, state_map, r, k, v, w, u, init_state
@@ -15,9 +15,11 @@ def get_generalized_delta_rule(HEAD_SIZE=64, KERNEL_TYPE="native"):
  USE_TRITON_KERNEL = False
  if keras.config.backend() == "torch":
  import torch
+
  if not torch.cuda.is_available():
  from .native_keras_op import generalized_delta_rule
- return generalized_delta_rule,False
+
+ return generalized_delta_rule, False

  if KERNEL_TYPE.lower() == "triton":
  from .torch_op import generalized_delta_rule
@@ -157,8 +159,8 @@ def get_generalized_delta_rule(HEAD_SIZE=64, KERNEL_TYPE="native"):
  elif keras.config.backend() == "jax":
  import jax
  import os
-
- if jax.devices()[0].platform == "gpu":
+
+ if jax.devices()[0].platform == "gpu":
  if KERNEL_TYPE.lower() == "triton":
  os.environ["JAX_LOG_COMPUTATION"] = "0"
  from .jax_op import generalized_delta_rule
@@ -196,6 +198,8 @@ def get_generalized_delta_rule(HEAD_SIZE=64, KERNEL_TYPE="native"):
  from .native_keras_op import generalized_delta_rule
  else:
  from .native_keras_op import generalized_delta_rule
+ elif keras.config.backend() == "mlx" and KERNEL_TYPE.lower() == "cuda":
+ from .mlx_op import generalized_delta_rule
  else:
  from .native_keras_op import generalized_delta_rule
  return generalized_delta_rule, USE_TRITON_KERNEL
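The hunk above adds an MLX dispatch branch to `get_generalized_delta_rule`. A minimal selection sketch follows; the signature and the `(kernel, USE_TRITON_KERNEL)` return value match the hunks above, while the top-level import path is an assumption (the diff only shows the function defined in `rwkv_ops/rwkv7_kernel/__init__.py`):

```python
# Hedged sketch of kernel selection; import path assumed, signature from the diff.
from rwkv_ops import get_generalized_delta_rule

# Returns the backend-specific op plus a flag indicating whether the Triton kernel is used.
generalized_delta_rule, use_triton_kernel = get_generalized_delta_rule(
    HEAD_SIZE=64,        # head dimension
    KERNEL_TYPE="cuda",  # "native", "triton", or "cuda"; on the "mlx" backend only "cuda" reaches the new mlx_op
)
```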
@@ -74,9 +74,9 @@ if check_pytorch_version("2.4"):
  def custom_device_ctx(index: int):
  return device_torch_lib.device(index)
  else:
- assert device == "cuda", (
- "Only cuda device is supported for PyTorch version < 2.4.0."
- )
+ assert (
+ device == "cuda"
+ ), "Only cuda device is supported for PyTorch version < 2.4.0."
  autocast_custom_fwd = device_torch_lib.amp.custom_fwd
  autocast_custom_bwd = device_torch_lib.amp.custom_bwd

@@ -11,6 +11,7 @@ import jax
  import jax.numpy as jnp
  from typing import Optional, Tuple, Union
  from jax.ad_checkpoint import checkpoint_policies as cp
+
  CHUNK_LEN = 16 # 这是一个常数
  # ---------- 延迟编译(改到当前目录) ----------
  _CURRENT_DIR = pathlib.Path(
@@ -25,9 +25,9 @@ def chunk_dplr_bwd_dhu(
  B, T, H, K, V = *qg.shape, do.shape[-1]
  BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
  BK = triton.next_power_of_2(K)
- assert BK <= 256, (
- "current kernel does not support head dimension being larger than 256."
- )
+ assert (
+ BK <= 256
+ ), "current kernel does not support head dimension being larger than 256."
  # H100
  if check_shared_mem("hopper"):
  BV = 64
@@ -42,9 +42,9 @@ def chunk_dplr_bwd_dhu(
  N, NT = B, triton.cdiv(T, BT)
  BC = min(BT, BC)
  NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
- assert NK == 1, (
- "NK > 1 is not supported because it involves time-consuming synchronization"
- )
+ assert (
+ NK == 1
+ ), "NK > 1 is not supported because it involves time-consuming synchronization"
  dh_shape = (B, NT, H, K, V)
  out_shapes = [
  jax.ShapeDtypeStruct(dh_shape, dv.dtype),
@@ -43,9 +43,9 @@ def chunk_dplr_fwd_h(
  BC = min(BT, BC)
  NK = triton.cdiv(K, BK)
  NV = triton.cdiv(V, BV)
- assert NK == 1, (
- "NK > 1 is not supported because it involves time-consuming synchronization"
- )
+ assert (
+ NK == 1
+ ), "NK > 1 is not supported because it involves time-consuming synchronization"

  out_shapes = [
  jax.ShapeDtypeStruct((B, NT, H, K, V), kg.dtype),
@@ -0,0 +1,132 @@
+ # copy from https://github.com/ml-explore/mlx-lm/pull/580
+ from dataclasses import dataclass
+ from functools import partial
+ from typing import Optional
+
+ import mlx.core as mx
+ import mlx.nn as nn
+
+
+ @partial(mx.compile, shapeless=True)
+ def addcmul(x, y, z):
+ return x + y * z
+
+
+ @partial(mx.compile, shapeless=True)
+ def l2_norm(x):
+ return x / mx.maximum(mx.linalg.norm(x, axis=-1, keepdims=True), 1e-7)
+
+
+ def _make_wkv7_kernel():
+ if not mx.metal.is_available():
+ return None
+ source = f"""
+ auto n = thread_position_in_grid.z;
+ auto b_idx = n / H;
+ auto h_idx = n % H;
+ constexpr int n_per_t = D / 32;
+ // [B, T, H, D]
+ auto r_ = r + b_idx * T * H * D + h_idx * D;
+ auto w_ = w + b_idx * T * H * D + h_idx * D;
+ auto k_ = k + b_idx * T * H * D + h_idx * D;
+ auto v_ = v + b_idx * T * H * D + h_idx * D;
+ auto a_ = a + b_idx * T * H * D + h_idx * D;
+ auto b_ = b + b_idx * T * H * D + h_idx * D;
+ y += b_idx * T * H * D + h_idx * D;
+ auto dk_idx = thread_position_in_threadgroup.x;
+ auto dv_idx = thread_position_in_grid.y;
+ // state_in, state_out: [B, H, D, D]
+ auto i_state = state_in + (n * D + dv_idx) * D;
+ auto o_state = state_out + (n * D + dv_idx) * D;
+ float state[n_per_t];
+ for (int i = 0; i < n_per_t; ++i) {{
+ auto s_idx = n_per_t * dk_idx + i;
+ state[i] = static_cast<float>(i_state[s_idx]);
+ }}
+ for (int t = 0; t < T; ++t) {{
+ float sa = 0.0f;
+ for (int i = 0; i < n_per_t; ++i) {{
+ auto s_idx = n_per_t * dk_idx + i;
+ sa += state[i] * a_[s_idx];
+ state[i] = state[i] * w_[s_idx];
+ }}
+ sa = simd_sum(sa);
+ float out = 0.0f;
+ for (int i = 0; i < n_per_t; ++i) {{
+ auto s_idx = n_per_t * dk_idx + i;
+ state[i] = state[i] + k_[s_idx] * v_[dv_idx] + sa * b_[s_idx];
+ out += state[i] * r_[s_idx];
+ }}
+ out = simd_sum(out);
+ if (thread_index_in_simdgroup == 0) {{
+ y[dv_idx] = static_cast<InT>(out);
+ }}
+ // Increment data pointers to next time step
+ r_ += H * D;
+ w_ += H * D;
+ k_ += H * D;
+ v_ += H * D;
+ a_ += H * D;
+ b_ += H * D;
+ y += H * D;
+ }}
+ for (int i = 0; i < n_per_t; ++i) {{
+ auto s_idx = n_per_t * dk_idx + i;
+ o_state[s_idx] = static_cast<InT>(state[i]);
+ }}
+ """
+ inputs = ["r", "w", "k", "v", "a", "b", "state_in", "T"]
+ return mx.fast.metal_kernel(
+ name="wkv7_kernel",
+ input_names=inputs,
+ output_names=["y", "state_out"],
+ source=source,
+ )
+
+
+ _wkv7_kernel = _make_wkv7_kernel()
+
+
+ def transpose_head(x, head_first: bool = True):
+ if head_first:
+ return mx.transpose(x, (0, 2, 1, 3))
+ return x
+
+
+ def generalized_delta_rule(
+ r,
+ w,
+ k,
+ v,
+ a,
+ b,
+ initial_state=None,
+ output_final_state: bool = True,
+ head_first: bool = False,
+ ):
+ state = initial_state
+
+ r = transpose_head(r, head_first)
+ k = transpose_head(k, head_first)
+ v = transpose_head(v, head_first)
+ a = transpose_head(a, head_first)
+ b = transpose_head(b, head_first)
+
+ B, T, H, D = r.shape
+ input_dtype = r.dtype
+
+ y, out_state = _wkv7_kernel(
+ inputs=[r, w, k, v, a, b, state, T],
+ template=[
+ ("InT", input_dtype),
+ ("H", H),
+ ("D", D),
+ ],
+ grid=(32, D, B * H),
+ threadgroup=(32, 4, 1),
+ output_shapes=[(B, T, H, D), state.shape],
+ output_dtypes=[input_dtype, input_dtype],
+ )
+ if output_final_state:
+ return y, out_state
+ return y
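The new `mlx_op.py` above exposes a forward-only Metal kernel for the RWKV-7 generalized delta rule. A minimal call sketch follows; the input layout `[B, T, H, D]` and state layout `[B, H, D, D]` come from the kernel's own comments, the import path mirrors the file location listed in RECORD below, and the tensor values are purely illustrative:

```python
# Hedged usage sketch of the new MLX op (forward pass only, Metal required).
import mlx.core as mx
from rwkv_ops.rwkv7_kernel.mlx_op import generalized_delta_rule  # assumed import path

B, T, H, D = 1, 8, 2, 64            # D should be a multiple of 32 (the kernel uses D / 32 elements per lane)
r, w, k, v, a, b = (mx.random.normal((B, T, H, D)) for _ in range(6))  # illustrative values, not a trained layer
state = mx.zeros((B, H, D, D))      # an initial state is required: the kernel reads state_in directly

y, final_state = generalized_delta_rule(
    r, w, k, v, a, b,
    initial_state=state,
    output_final_state=True,
    head_first=False,               # inputs are already [B, T, H, D]
)
```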
@@ -24,9 +24,9 @@ def chunk_dplr_bwd_dhu(
  B, T, H, K, V = *qg.shape, do.shape[-1]
  BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
  BK = triton.next_power_of_2(K)
- assert BK <= 256, (
- "current kernel does not support head dimension being larger than 256."
- )
+ assert (
+ BK <= 256
+ ), "current kernel does not support head dimension being larger than 256."
  # H100
  if check_shared_mem("hopper", qg.device.index):
  BV = 64
@@ -42,9 +42,9 @@ def chunk_dplr_bwd_dhu(

  BC = min(BT, BC)
  NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
- assert NK == 1, (
- "NK > 1 is not supported because it involves time-consuming synchronization"
- )
+ assert (
+ NK == 1
+ ), "NK > 1 is not supported because it involves time-consuming synchronization"

  dh = qg.new_empty(B, NT, H, K, V)
  dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None
@@ -42,9 +42,9 @@ def chunk_dplr_fwd_h(
  BC = min(BT, BC)
  NK = triton.cdiv(K, BK)
  NV = triton.cdiv(V, BV)
- assert NK == 1, (
- "NK > 1 is not supported because it involves time-consuming synchronization"
- )
+ assert (
+ NK == 1
+ ), "NK > 1 is not supported because it involves time-consuming synchronization"

  h = kg.new_empty(B, NT, H, K, V)
  final_state = (
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: rwkv-ops
- Version: 0.3.2
+ Version: 0.3.3
  Summary: RWKV operators for multiple backends (PyTorch, JAX, Keras)
  Project-URL: Homepage, https://github.com/pass-lin/rwkv_ops
  Author-email: pass-lin <qw_lin@qq.com>
@@ -125,12 +125,14 @@ if padding_mask is not None:
  | JAX | ✅ | ✅ | ✅ |
  | TensorFlow | ⚠️ | ❌ | ✅ |
  | NumPy | ❌ | ❌ | ✅ |
+ | MLX | ⚠️ | ❌ | ❌ |

  ---
  > `native` 为原生算子,无 chunkwise,速度慢且显存高。
  > `triton` 使用的是chunkwise算法实现,速度快,并行度高,缺点是精度很差,介意勿用
  > `cuda` 为基于 CUDA 的原生算子,速度很快,并且kernel内部使用fp32实现,所以精度也很高。缺点就是长序列的时候比较吃亏跑不满。
  > tensorflow的CUDA实现只支持前向计算,是没有梯度的。并且这个是使用jax的cuda实现实现的,你需要保证你能够成功运行jax的cuda kernel。
+ > 因为MLX还没合并到keras,所以原生算子暂不支持。但是我们提供了一个前向的算子。

  ## rwkv6op 使用方法

@@ -1,8 +1,8 @@
- rwkv_ops/__init__.py,sha256=ojPQmkz3yWNqqJwIyjAfsxWB_h3TowtBrJtuRqssEvA,855
+ rwkv_ops/__init__.py,sha256=voJa6h1nvEua5iD2H-mxsZR3j6s1at8xrqnXj4Q-WYQ,855
  rwkv_ops/rwkv6_kernel/__init__.py,sha256=ktIzkK6EUc2nonLQnl2NAjJj9kMt02i9zqfjFcnM_NQ,3647
- rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py,sha256=4SL93Z4mmuQldHtmwqTKcP7M-outTU5Rge2qgDGzwBg,29966
- rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py,sha256=c3ZSJ9xC6-PKr88pOhjmBximdhwmP1_i7UOcIdKB43c,3354
- rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py,sha256=Pv0WsBp5byTSwkYrYkHcJa3wftSsHHzfRzleKdmJayY,12915
+ rwkv_ops/rwkv6_kernel/jax_rwkv_kernel.py,sha256=DTYtT2v2WjOlndD-ESNmbmVj1ili03KOXOZek1V4DLw,29954
+ rwkv_ops/rwkv6_kernel/ops_rwkv_kernel.py,sha256=XBsYAkbsyUCsxVPR-RfrboQkyM8TIzW1saOsh6vEjcM,3352
+ rwkv_ops/rwkv6_kernel/torch_rwkv_kernel.py,sha256=tq5H-ndm9nq2nWTQYI-xpw5YK0j0E31yF552muSFHN4,12883
  rwkv_ops/rwkv6_kernel/jax_kernel_cuda/gpu_ops.cpp,sha256=oM13TCQi2GMIf3f-Z39WOL8M_8GmGI_Kdhiq3Y2keJw,1643
  rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernel_helpers.h,sha256=epwsW8OUIOvrlNuW3BAmAbgB8n8CKOFEYafBxQy3ptw,2209
  rwkv_ops/rwkv6_kernel/jax_kernel_cuda/kernels.h,sha256=KYJiWmmig0Wh-zpiWV96J_be8jlyc38Ztd1iqNoqVFI,1501
@@ -15,21 +15,22 @@ rwkv_ops/rwkv6_kernel/jax_kernel_hip/pybind11_kernel_helpers.h,sha256=CMQclcyHaD
  rwkv_ops/rwkv6_kernel/jax_kernel_hip/rwkv_kernels.hip,sha256=givSxPA7YfKGz75rOtN8TAjTxWWraVNgTGPZfAJsZsQ,20836
  rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_cuda.cu,sha256=tfRbMQBkl_LT7EVaJ6KoWYcQ902ApCrS6zkjXldFZXY,12770
  rwkv_ops/rwkv6_kernel/torch_kernel/wkv6_op.cpp,sha256=cyCTiF--4SQiDJu7Dy_NuEhSe1vyki6JS4I2rsvT714,6659
- rwkv_ops/rwkv7_kernel/__init__.py,sha256=HfoB043qxcIyljNcSd_XtH2UKB6wF2qQlOq9VvXwWRI,8129
+ rwkv_ops/rwkv7_kernel/__init__.py,sha256=OJq9ZU1GPP5Si8LY66miTFFotxyGlBtPyUp49Cedl8k,8250
  rwkv_ops/rwkv7_kernel/get_jax_devices_info.py,sha256=cMIaNED7d1PvYNSyq8wNI3G7wNvcgdUj9HWRBLuSVM8,6004
- rwkv_ops/rwkv7_kernel/get_torch_devices_info.py,sha256=ZL_rAM6lHB4nTOOU28Xm08qptfuIoijOMi_xwJG3KCo,7380
+ rwkv_ops/rwkv7_kernel/get_torch_devices_info.py,sha256=BR6IqwcBDKjLf-uRCh0LAzYtRl4KP43JO5fnd9jsr2c,7380
  rwkv_ops/rwkv7_kernel/jax_op.py,sha256=C7jOvJ-ZWTFfCZBQNzMbqgoVHuDS2QCGlBsGEMM4Fn0,9140
+ rwkv_ops/rwkv7_kernel/mlx_op.py,sha256=Ss9i_1TdGPNnc1YpD7QBSdK1_sTQ2R5l8Mk5UIcWQJ0,3795
  rwkv_ops/rwkv7_kernel/native_keras_op.py,sha256=dCWdzuVZxAKHCBURZqgOLN3n_yKFFNX5uORlbvztH6w,2502
  rwkv_ops/rwkv7_kernel/tf_eager_kernel.py,sha256=2t2uf1iNznYpYFlqt9REY0GwGeycYuaJl-4QFk2rJHc,4357
  rwkv_ops/rwkv7_kernel/torch_op.py,sha256=jw_AvqshTAG4t9-MRqxFQNi_bTzxNbx3lwnMifPk8-8,14070
  rwkv_ops/rwkv7_kernel/jax_cuda_kernel/CMakeLists.txt,sha256=Dq4Ea8N2xOEej2jZpEw4MtFjUFgN0PUciejVOCSP-FM,1400
  rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_ffi.cu,sha256=WePveEdUixaQA51hJUK8Sr7Q7jDTstybEWZczdjuGSo,9690
- rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py,sha256=3lvCKIa9DO7MY3aZNyJM0AyHlQUvDKGsnYVr8MLl7Vg,7998
+ rwkv_ops/rwkv7_kernel/jax_cuda_kernel/wkv7_jax.py,sha256=t2mQ_zGhMeBZClcaLJSwRG4n2MtRhvn9z-vHWK79F6w,7999
  rwkv_ops/rwkv7_kernel/jax_kernel/__init__.py,sha256=uHsf_1qrtRK62IvhLuzefHGPWpHXmw1p0tqmwlHcptk,346
  rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_bwd.py,sha256=2Voq1Bdzn0DFloiLvwINBk7akmxRWIqXIQeyafrJJGg,2138
  rwkv_ops/rwkv7_kernel/jax_kernel/chunk_A_fwd.py,sha256=rhmglqHIIww7yPzaSBEp9ISxhhxoUbMtV51AUDyhUd8,1425
- rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py,sha256=JDfVZsMb8yMlMN3sKT3i3l3y1YQiQkyUjnSNyan5Fqc,1888
- rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py,sha256=g8b_81rIIjxeknYiklRGnox24rAvEvfKRKT-5nI0Euo,1992
+ rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_bwd.py,sha256=U06dcacmND-y022mN4UmDunfRDxJYWthU_4V8z0HcSs,1888
+ rwkv_ops/rwkv7_kernel/jax_kernel/chunk_h_fwd.py,sha256=Nl0migPjRmQopIsysSqt7ZMQ_X-vyblb7e2t-xghzlA,1992
  rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_bwd.py,sha256=gQnToi1e1GZCvjWsEdWx6WakUN4Lc0JfaBSsSXYdN84,3369
  rwkv_ops/rwkv7_kernel/jax_kernel/chunk_o_fwd.py,sha256=4SjQ_zTZvFxsBMeWOx0JGFg9EQ4vllvEx30EcvSZJzI,853
  rwkv_ops/rwkv7_kernel/jax_kernel/cumsum.py,sha256=NoOh2_hA_rdH5bmaNNMAdCgVPfWvQpf-Q8BqF926jrw,667
@@ -40,8 +41,8 @@ rwkv_ops/rwkv7_kernel/torch_cuda_kernel/wkv7_op.cpp,sha256=Wk5QYvIM9m-YJdSEh6zSz
  rwkv_ops/rwkv7_kernel/torch_kernel/__init__.py,sha256=_u1srIATeoHKlVTVWbWXdpkjaggugl9y-Kx_Y4pYdIY,430
  rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_bwd.py,sha256=CWtotXkVvHz4-rkuOqWh6zKy95jwimS9If6SU45ylW0,2103
  rwkv_ops/rwkv7_kernel/torch_kernel/chunk_A_fwd.py,sha256=4RJbyUTO23OxwH1rGVxeBiBVZKNHpPL_tJ7MFoDCIts,1475
- rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py,sha256=zo6l0ZZUhXFu8wEFD76I0zSqFT9IXFKUKtyeaSwk380,1795
- rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py,sha256=0ucN1U0EDTDqcyTPLLcsAX6FLTf2E_3toOY9p81gWYE,1858
+ rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_bwd.py,sha256=AdbgPd0JRfPZ_poK_XAQ5iV1GsBqDehiN0lf_-_CbUw,1795
+ rwkv_ops/rwkv7_kernel/torch_kernel/chunk_h_fwd.py,sha256=o_EbLxqqnzW8_aNduqv_Brd_-SlUU3szfi8Lfn40rqc,1858
  rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_bwd.py,sha256=ioPrS0NYQhpFk1j8rAxqtbwpx1CwjJQnrJEBDqVy-As,3283
  rwkv_ops/rwkv7_kernel/torch_kernel/chunk_o_fwd.py,sha256=54yoa3NpV64H-koURt-hUWpFHhUjwXpGvXPp2_ETCnw,825
  rwkv_ops/rwkv7_kernel/torch_kernel/cumsum.py,sha256=hQkpyaa0eUyB4V3UVks7l1_dHwOrbump0FZILityBKw,611
@@ -58,7 +59,7 @@ rwkv_ops/rwkv7_kernel/triton_kernel/cumsum.py,sha256=pRp_z587PrnpgRVpi031IndyjVI
  rwkv_ops/rwkv7_kernel/triton_kernel/utils.py,sha256=TNGlkwGq4t-TOcdVBk_N_vHPLzMFTu_F0V-O1RprIO4,553
  rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_bwd.py,sha256=szaG11q_WmpyhXi6aVWwzizvflCh5wND8wGA_V8afzA,5479
  rwkv_ops/rwkv7_kernel/triton_kernel/wy_fast_fwd.py,sha256=jbb19DUTHENU2RIOv_T4m_W1eXMqdRqG0XevIkBOhI4,9438
- rwkv_ops-0.3.2.dist-info/METADATA,sha256=lkSey3fiZxPrVO05sSb7Q4Q2cAHFgo8-f8RZjmLAWL4,8853
- rwkv_ops-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- rwkv_ops-0.3.2.dist-info/licenses/LICENSE.txt,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
- rwkv_ops-0.3.2.dist-info/RECORD,,
+ rwkv_ops-0.3.3.dist-info/METADATA,sha256=EU1tq3Ub9WqVpcNYRa1T_I9H2Nx1nNypX8fZWdu7bsM,9011
+ rwkv_ops-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ rwkv_ops-0.3.3.dist-info/licenses/LICENSE.txt,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+ rwkv_ops-0.3.3.dist-info/RECORD,,