brainstate 0.1.0__py2.py3-none-any.whl → 0.1.0.post20241122__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. benchmark/COBA_2005.py +125 -0
  2. benchmark/CUBA_2005.py +149 -0
  3. brainstate/augment/_autograd.py +9 -6
  4. brainstate/event/__init__.py +4 -2
  5. brainstate/event/_csr.py +26 -18
  6. brainstate/event/_csr_benchmark.py +14 -0
  7. brainstate/event/_fixed_probability.py +589 -152
  8. brainstate/event/_fixed_probability_benchmark.py +128 -0
  9. brainstate/event/_fixed_probability_test.py +13 -10
  10. brainstate/event/_linear.py +267 -127
  11. brainstate/event/_linear_benckmark.py +82 -0
  12. brainstate/event/_linear_test.py +8 -3
  13. brainstate/event/_xla_custom_op.py +312 -0
  14. brainstate/event/_xla_custom_op_test.py +55 -0
  15. brainstate/nn/_dyn_impl/_dynamics_synapse.py +6 -11
  16. brainstate/nn/_dyn_impl/_rate_rnns.py +1 -1
  17. brainstate/nn/_dynamics/_projection_base.py +1 -1
  18. brainstate/nn/_exp_euler.py +1 -1
  19. brainstate/nn/_interaction/__init__.py +13 -4
  20. brainstate/nn/_interaction/{_connections.py → _conv.py} +0 -227
  21. brainstate/nn/_interaction/{_connections_test.py → _conv_test.py} +0 -15
  22. brainstate/nn/_interaction/_linear.py +582 -0
  23. brainstate/nn/_interaction/_linear_test.py +42 -0
  24. brainstate/optim/_lr_scheduler.py +1 -1
  25. brainstate/optim/_optax_optimizer.py +18 -0
  26. {brainstate-0.1.0.dist-info → brainstate-0.1.0.post20241122.dist-info}/METADATA +1 -1
  27. {brainstate-0.1.0.dist-info → brainstate-0.1.0.post20241122.dist-info}/RECORD +30 -21
  28. {brainstate-0.1.0.dist-info → brainstate-0.1.0.post20241122.dist-info}/top_level.txt +1 -0
  29. {brainstate-0.1.0.dist-info → brainstate-0.1.0.post20241122.dist-info}/LICENSE +0 -0
  30. {brainstate-0.1.0.dist-info → brainstate-0.1.0.post20241122.dist-info}/WHEEL +0 -0
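The headline change in this release is the rewrite of brainstate/event/_fixed_probability.py: FixedProb now takes in_size/out_size instead of n_pre/n_post, gains float_as_event and block_size options, drops grad_mode, and routes update() through a new event_fixed_prob custom operator backed by Numba CPU kernels and Pallas GPU kernels. A minimal usage sketch follows, assuming FixedProb is re-exported from brainstate.event as the package layout suggests; the sizes and probability are made up for illustration only:

import numpy as np
import brainstate as bs

# Hypothetical sizes; the constructor requires int(out_size * prob) >= 1,
# as enforced in the diff below.
proj = bs.event.FixedProb(
    in_size=1000,           # number of pre-synaptic neurons
    out_size=800,           # number of post-synaptic neurons
    prob=0.1,               # connection probability -> n_conn = int(800 * 0.1) = 80
    weight=0.5,             # homogeneous maximum synaptic conductance
    float_as_event=True,    # non-zero float spikes count as binary events
    block_size=None,        # None lets the op choose a block size from n_conn
    seed=42,
)

spikes = np.random.rand(1000) < 0.05    # boolean spike vector of length in_size
post = proj.update(spikes)              # post-synaptic drive, shape (out_size,)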
@@ -12,23 +12,26 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # ==============================================================================
+
  from __future__ import annotations

  from typing import Union, Callable, Optional

  import brainunit as u
  import jax
+ import jax.experimental.pallas as pl
  import jax.numpy as jnp
  import numpy as np
+ from jax.interpreters import ad

  from brainstate._state import ParamState
- from brainstate._utils import set_module_as
- from brainstate.compile import for_loop
+ from brainstate.augment import vmap
  from brainstate.init import param
  from brainstate.nn._module import Module
  from brainstate.random import RandomState
- from brainstate.typing import ArrayLike
- from ._misc import FloatScalar, IntScalar
+ from brainstate.typing import ArrayLike, Size
+ from ._misc import FloatScalar
+ from ._xla_custom_op import XLACustomOp

  __all__ = [
  'FixedProb',
@@ -41,19 +44,23 @@ class FixedProb(Module):

  Parameters
  ----------
- n_pre : int
- Number of pre-synaptic neurons.
- n_post : int
- Number of post-synaptic neurons.
+ in_size : Size
+ Number of pre-synaptic neurons, i.e., input size.
+ out_size : Size
+ Number of post-synaptic neurons, i.e., output size.
  prob : float
- Probability of connection.
+ Probability of connection, i.e., connection probability.
  weight : float or callable or jax.Array or brainunit.Quantity
- Maximum synaptic conductance.
+ Maximum synaptic conductance, i.e., synaptic weight.
  allow_multi_conn : bool, optional
  Whether multiple connections are allowed from a single pre-synaptic neuron.
  Default is True, meaning that a value of ``a`` can be selected multiple times.
- prob : float
- Probability of connection.
+ seed: int, optional
+ Random seed. Default is None. If None, the default random seed will be used.
+ float_as_event : bool, optional
+ Whether to treat float as event. Default is True.
+ block_size : int, optional
+ Block size for parallel computation. Default is 64. This is only used for GPU.
  name : str, optional
  Name of the module.
  """
@@ -62,210 +69,640 @@ class FixedProb(Module):

  def __init__(
  self,
- n_pre: IntScalar,
- n_post: IntScalar,
+ in_size: Size,
+ out_size: Size,
  prob: FloatScalar,
  weight: Union[Callable, ArrayLike],
  allow_multi_conn: bool = True,
  seed: Optional[int] = None,
+ float_as_event: bool = True,
+ block_size: Optional[int] = None,
  name: Optional[str] = None,
- grad_mode: str = 'vjp'
  ):
  super().__init__(name=name)
- self.n_pre = n_pre
- self.n_post = n_post
- self.in_size = n_pre
- self.out_size = n_post

- self.n_conn = int(n_post * prob)
+ # network parameters
+ self.in_size = in_size
+ self.out_size = out_size
+ self.n_conn = int(self.out_size[-1] * prob)
  if self.n_conn < 1:
- raise ValueError(
- f"The number of connections must be at least 1. Got: int({n_post} * {prob}) = {self.n_conn}")
-
- assert grad_mode in ['vjp', 'jvp'], f"Unsupported grad_mode: {grad_mode}"
- self.grad_mode = grad_mode
+ raise ValueError(f"The number of connections must be at least 1. "
+ f"Got: int({self.out_size[-1]} * {prob}) = {self.n_conn}")
+ self.float_as_event = float_as_event
+ self.block_size = block_size

  # indices of post connected neurons
- if allow_multi_conn:
- self.indices = np.random.RandomState(seed).randint(0, n_post, size=(self.n_pre, self.n_conn))
- else:
- rng = RandomState(seed)
- self.indices = for_loop(lambda i: rng.choice(n_post, size=(self.n_conn,), replace=False), np.arange(n_pre))
- self.indices = u.math.asarray(self.indices)
+ with jax.ensure_compile_time_eval():
+ if allow_multi_conn:
+ rng = np.random.RandomState(seed)
+ self.indices = rng.randint(0, self.out_size[-1], size=(self.in_size[-1], self.n_conn))
+ else:
+ rng = RandomState(seed)
+
+ @vmap(rngs=rng)
+ def rand_indices(key):
+ rng.set_key(key)
+ return rng.choice(self.out_size[-1], size=(self.n_conn,), replace=False)
+
+ self.indices = rand_indices(rng.split_key(self.in_size[-1]))
+ self.indices = u.math.asarray(self.indices)

  # maximum synaptic conductance
- weight = param(weight, (self.n_pre, self.n_conn), allow_none=False)
+ weight = param(weight, (self.in_size[-1], self.n_conn), allow_none=False)
  self.weight = ParamState(weight)

  def update(self, spk: jax.Array) -> Union[jax.Array, u.Quantity]:
- device_kind = jax.devices()[0].platform # spk.device.device_kind
- if device_kind == 'cpu':
- return cpu_fixed_prob(self.indices,
- u.math.asarray(self.weight.value),
- u.math.asarray(spk),
- n_post=self.n_post,
- grad_mode=self.grad_mode)
- elif device_kind in ['gpu', 'tpu']:
- raise NotImplementedError()
- else:
- raise ValueError(f"Unsupported device: {device_kind}")
+ return event_fixed_prob(
+ spk,
+ self.weight.value,
+ self.indices,
+ n_post=self.out_size[-1],
+ block_size=self.block_size,
+ float_as_event=self.float_as_event
+ )


- @set_module_as('brainstate.event')
- def cpu_fixed_prob(
- indices: jax.Array,
- weight: Union[u.Quantity, jax.Array],
- spk: jax.Array,
- *,
- n_post: int,
- grad_mode: str = 'vjp'
- ) -> Union[u.Quantity, jax.Array]:
+ def event_fixed_prob(spk, weight, indices, *, n_post, block_size, float_as_event):
  """
  The FixedProb module implements a fixed probability connection with CSR sparse data structure.

  Parameters
  ----------
- n_post : int
- Number of post-synaptic neurons.
  weight : brainunit.Quantity or jax.Array
  Maximum synaptic conductance.
  spk : jax.Array
  Spike events.
- indices : jax.Array
- Indices of post connected neurons.
- grad_mode : str, optional
- Gradient mode. Default is 'vjp'. Can be 'vjp' or 'jvp'.

  Returns
  -------
  post_data : brainunit.Quantity or jax.Array
  Post synaptic data.
  """
- unit = u.get_unit(weight)
- weight = u.get_mantissa(weight)
- indices = jnp.asarray(indices)
- spk = jnp.asarray(spk)
+ with jax.ensure_compile_time_eval():
+ weight = u.math.asarray(weight)
+ unit = u.get_unit(weight)
+ weight = u.get_mantissa(weight)
+ indices = jnp.asarray(indices)
+ spk = jnp.asarray(spk)

  def mv(spk_vector):
  assert spk_vector.ndim == 1, f"spk must be 1D. Got: {spk.ndim}"
- if grad_mode == 'vjp':
- post_data = _cpu_event_fixed_prob_mv_vjp(indices, weight, spk_vector, n_post)
- elif grad_mode == 'jvp':
- post_data = _cpu_event_fixed_prob_mv_jvp(indices, weight, spk_vector, n_post)
- else:
- raise ValueError(f"Unsupported grad_mode: {grad_mode}")
- return post_data
+ return event_ellmv_p_call(
+ spk,
+ weight,
+ indices,
+ n_post=n_post,
+ block_size=block_size,
+ float_as_event=float_as_event
+ )

  assert spk.ndim >= 1, f"spk must be at least 1D. Got: {spk.ndim}"
  assert weight.ndim in [2, 0], f"weight must be 2D or 0D. Got: {weight.ndim}"
  assert indices.ndim == 2, f"indices must be 2D. Got: {indices.ndim}"

  if spk.ndim == 1:
- post_data = mv(spk)
+ [post_data] = mv(spk)
  else:
- shape = spk.shape[:-1]
- post_data = jax.vmap(mv)(u.math.reshape(spk, (-1, spk.shape[-1])))
- post_data = u.math.reshape(post_data, shape + post_data.shape[-1:])
+ [post_data] = jax.vmap(mv)(u.math.reshape(spk, (-1, spk.shape[-1])))
+ post_data = u.math.reshape(post_data, spk.shape[:-1] + post_data.shape[-1:])
  return u.maybe_decimal(u.Quantity(post_data, unit=unit))


- # -------------------
- # CPU Implementation
- # -------------------
-
+ Kernel = Callable
+
+
+ def cpu_kernel_generator(
+ float_as_event: bool,
+ weight_info: jax.ShapeDtypeStruct,
+ spike_info: jax.ShapeDtypeStruct,
+ **kwargs
+ ):
+ import numba # pylint: disable=import-outside-toplevel
+
+ if weight_info.size == 1:
+ if spike_info.dtype == jnp.bool_:
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ w = weights[()]
+ for i in range(spikes.shape[0]):
+ if spikes[i]:
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += w
+
+ elif float_as_event:
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ w = weights[()]
+ for i in range(spikes.shape[0]):
+ if spikes[i] != 0.:
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += w

- def _cpu_event_fixed_prob_mv(indices, g_max, spk, n_post: int) -> jax.Array:
- def scan_fn(post, i):
- w = g_max if jnp.size(g_max) == 1 else g_max[i]
- ids = indices[i]
- sp = spk[i]
- if spk.dtype == jnp.bool_:
- post = jax.lax.cond(sp, lambda: post.at[ids].add(w), lambda: post)
  else:
- post = jax.lax.cond(sp == 0., lambda: post, lambda: post.at[ids].add(w * sp))
- return post, None
-
- return jax.lax.scan(scan_fn, jnp.zeros((n_post,), dtype=g_max.dtype), np.arange(len(spk)))[0]
-
-
- # --------------
- # VJP
- # --------------
-
- def _cpu_event_fixed_prob_mv_fwd(indices, g_max, spk, n_post):
- return _cpu_event_fixed_prob_mv(indices, g_max, spk, n_post=n_post), (g_max, spk)
-
-
- def _cpu_event_fixed_prob_mv_bwd(indices, n_post, res, ct):
- weight, spk = res
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ w = weights[()]
+ for i in range(spikes.shape[0]):
+ sp = spikes[i]
+ if sp != 0.:
+ wsp = w * sp
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += wsp

- # ∂L/∂spk = ∂L/∂y * ∂y/∂spk
- homo = jnp.size(weight) == 1
- if homo: # homogeneous weight
- ct_spk = jax.vmap(lambda idx: jnp.sum(ct[idx] * weight))(indices)
- else: # heterogeneous weight
- ct_spk = jax.vmap(lambda idx, w: jnp.inner(ct[idx], w))(indices, weight)
-
- # ∂L/∂w = ∂L/∂y * ∂y/∂w
- if homo: # scalar
- ct_gmax = _cpu_event_fixed_prob_mv(indices, jnp.asarray(1.), spk, n_post=n_post)
- ct_gmax = jnp.inner(ct, ct_gmax)

- def scan_fn(d_gmax, i):
- if spk.dtype == jnp.bool_:
- d_gmax = jax.lax.cond(spk[i], lambda: d_gmax.at[i].add(ct[indices[i]]), lambda: d_gmax)
- else:
- d_gmax = jax.lax.cond(spk[i] == 0., lambda: d_gmax, lambda: d_gmax.at[i].add(ct[indices[i]] * spk[i]))
- return d_gmax, None
+ if spike_info.dtype == jnp.bool_:
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ for i in range(spikes.shape[0]):
+ if spikes[i]:
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += weights[i, j]
+
+ elif float_as_event:
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ for i in range(spikes.shape[0]):
+ if spikes[i] != 0.:
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += weights[i, j]

- ct_gmax = jax.lax.scan(scan_fn, jnp.zeros_like(weight), np.arange(len(spk)))[0]
- return ct_gmax, ct_spk
+ else:
+ @numba.njit
+ def ell_mv(spikes, weights, indices, posts):
+ posts[:] = 0.
+ for i in range(spikes.shape[0]):
+ sp = spikes[i]
+ if sp != 0.:
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += weights[i, j] * sp

+ return ell_mv

- _cpu_event_fixed_prob_mv_vjp = jax.custom_vjp(_cpu_event_fixed_prob_mv, nondiff_argnums=(0, 3))
- _cpu_event_fixed_prob_mv_vjp.defvjp(_cpu_event_fixed_prob_mv_fwd, _cpu_event_fixed_prob_mv_bwd)

+ def gpu_kernel_generator(
+ n_pre: int,
+ n_conn: int,
+ n_post: int,
+ block_size: int,
+ float_as_event: bool,
+ weight_info: jax.ShapeDtypeStruct,
+ **kwargs
+ ):
+ # For a spikes vector of shape [n_event] and indices/weights matrices of shape [n_event, n_conn],
+ # the operator is computed as follows:
+ #
+ # - each block processes [block_size] events, one event per pre-synaptic neuron
+ # - each block processes a [block_size, block_size] tile of indices and weights
+
+ if weight_info.size == 1:
+ def _ell_mv_kernel_homo(
+ sp_ref, # [block_size]
+ ind_ref, # [block_size, block_size]
+ _,
+ y_ref, # [n_post]
+ ):
+ r_pid = pl.program_id(0)
+ c_start = pl.program_id(1) * block_size
+ row_length = jnp.minimum(n_pre - r_pid * block_size, block_size)
+ mask = jnp.arange(block_size) + c_start < n_conn
+
+ def body_fn(j, _):
+ if sp_ref.dtype == jnp.bool_:
+ def true_fn():
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ pl.atomic_add(y_ref, ind, jnp.ones(block_size, dtype=weight_info.dtype), mask=mask)
+ # y_ref[ind] += 1.0
+ # ind = ind_ref[j, ...]
+ # pl.store(y_ref, ind, 1.0, mask=mask)
+
+ jax.lax.cond(sp_ref[j], true_fn, lambda: None)
+
+
+ else:
+ def true_fn(sp):
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ if float_as_event:
+ pl.atomic_add(y_ref, ind, jnp.ones(block_size, dtype=weight_info.dtype), mask=mask)
+ else:
+ pl.atomic_add(y_ref, ind, jnp.ones(block_size, dtype=weight_info.dtype) * sp, mask=mask)
+
+ sp_ = sp_ref[j]
+ jax.lax.cond(sp_ != 0., true_fn, lambda _: None, sp_)
+
+ jax.lax.fori_loop(0, row_length, body_fn, None)
+
+ # homogeneous weights
+ kernel = pl.pallas_call(
+ _ell_mv_kernel_homo,
+ out_shape=[
+ jax.ShapeDtypeStruct((n_post,), weight_info.dtype),
+ ],
+ in_specs=[
+ pl.BlockSpec((block_size,), lambda i, j: i),
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)),
+ pl.BlockSpec((n_post,), lambda i, j: 0)
+ ],
+ grid=(
+ pl.cdiv(n_pre, block_size),
+ pl.cdiv(n_conn, block_size),
+ ),
+ input_output_aliases={2: 0},
+ interpret=False
+ )
+ return (lambda spikes, weight, indices:
+ [kernel(spikes, indices, jnp.zeros(n_post, dtype=weight.dtype))[0] * weight])

- # --------------
- # JVP
- # --------------
+ else:
+ def _ell_mv_kernel_heter(
+ sp_ref, # [block_size]
+ ind_ref, # [block_size, block_size]
+ w_ref, # [block_size, block_size]
+ _,
+ y_ref, # [n_post]
+ ):
+ r_pid = pl.program_id(0)
+ c_start = pl.program_id(1) * block_size
+ row_length = jnp.minimum(n_pre - r_pid * block_size, block_size)
+ mask = jnp.arange(block_size) + c_start < n_conn
+
+ def body_fn(j, _):
+ if sp_ref.dtype == jnp.bool_:
+ def true_fn():
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ w = pl.load(w_ref, (j, pl.dslice(None)), mask=mask)
+ pl.atomic_add(y_ref, ind, w, mask=mask)
+
+ jax.lax.cond(sp_ref[j], true_fn, lambda: None)
+ else:
+ def true_fn(spk):
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ w = pl.load(w_ref, (j, pl.dslice(None)), mask=mask)
+ if not float_as_event:
+ w = w * spk
+ pl.atomic_add(y_ref, ind, w, mask=mask)
+
+ sp_ = sp_ref[j]
+ jax.lax.cond(sp_ != 0., true_fn, lambda _: None, sp_)
+
+ jax.lax.fori_loop(0, row_length, body_fn, None)
+
+ # heterogeneous weights
+ kernel = pl.pallas_call(
+ _ell_mv_kernel_heter,
+ out_shape=[
+ jax.ShapeDtypeStruct((n_post,), weight_info.dtype),
+ ],
+ in_specs=[
+ pl.BlockSpec((block_size,), lambda i, j: i), # sp_ref
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)), # ind_ref
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)), # w_ref,
+ pl.BlockSpec((n_post,), lambda i, j: 0)
+ ],
+ grid=(
+ pl.cdiv(n_pre, block_size),
+ pl.cdiv(n_conn, block_size),
+ ),
+ input_output_aliases={3: 0},
+ interpret=False
+ )
+ return (lambda spikes, weight, indices:
+ kernel(spikes, indices, weight, jnp.zeros(n_post, dtype=weight_info.dtype)))
+
+
+ def jvp_spikes(spk_dot, spikes, weights, indices, *, n_post, block_size, **kwargs):
+ return ellmv_p_call(
+ spk_dot,
+ weights,
+ indices,
+ n_post=n_post,
+ block_size=block_size,
+ )
+
+
+ def jvp_weights(w_dot, spikes, weights, indices, *, float_as_event, block_size, n_post, **kwargs):
+ return event_ellmv_p_call(
+ spikes,
+ w_dot,
+ indices,
+ n_post=n_post,
+ block_size=block_size,
+ float_as_event=float_as_event
+ )
+
+
+ def transpose_rule(
+ ct, spikes, weights, indices,
+ *,
+ float_as_event, n_post, n_conn, block_size, weight_info, **kwargs
+ ):
+ if ad.is_undefined_primal(indices):
+ raise ValueError("Cannot transpose with respect to sparse indices.")

+ ct = ct[0]

- def _cpu_event_fixed_prob_mv_jvp_rule(indices, n_post, primals, tangents):
- # forward pass
- weight, spk = primals
- y = _cpu_event_fixed_prob_mv(indices, weight, spk, n_post=n_post)
+ # ∂L/∂spk = ∂L/∂y * ∂y/∂spk
+ homo = weight_info.size == 1
+ if ad.is_undefined_primal(spikes):
+ if homo:
+ # homogeneous weight
+ ct_spk = jax.vmap(lambda idx: jnp.sum(ct[idx] * weights))(indices)
+ else:
+ # heterogeneous weight
+ ct_spk = jax.vmap(lambda idx, w: jnp.inner(ct[idx], w))(indices, weights)
+ return (ad.Zero(spikes) if type(ct) is ad.Zero else ct_spk), weights, indices

- # forward gradients
- gmax_dot, spk_dot = tangents
+ else:
+ # ∂L/∂w = ∂L/∂y * ∂y/∂w
+ if homo:
+ # scalar
+ ct_gmax = event_ellmv_p_call(
+ spikes,
+ jnp.asarray(1., dtype=weight_info.dtype),
+ indices,
+ n_post=n_post,
+ block_size=block_size,
+ float_as_event=float_as_event
+ )
+ ct_gmax = jnp.inner(ct, ct_gmax[0])
+ else:
+ def map_fn(one_spk, one_ind):
+ if spikes.dtype == jnp.bool_:
+ return jax.lax.cond(
+ one_spk,
+ lambda: ct[one_ind],
+ lambda: jnp.zeros([n_conn], weight_info.dtype)
+ )
+ else:
+ if float_as_event:
+ return jax.lax.cond(
+ one_spk == 0.,
+ lambda: jnp.zeros([n_conn], weight_info.dtype),
+ lambda: ct[one_ind]
+ )
+ else:
+ return jax.lax.cond(
+ one_spk == 0.,
+ lambda: jnp.zeros([n_conn], weight_info.dtype),
+ lambda: ct[one_ind] * one_spk
+ )
+
+ ct_gmax = jax.vmap(map_fn)(spikes, indices)
+ return spikes, (ad.Zero(weights) if type(ct) is ad.Zero else ct_gmax), indices
+
+
+ event_ellmv_p = XLACustomOp(
+ 'event_ell_mv',
+ cpu_kernel_generator=cpu_kernel_generator,
+ gpu_kernel_generator=gpu_kernel_generator,
+ )
+ event_ellmv_p.defjvp(jvp_spikes, jvp_weights, None)
+ event_ellmv_p.def_transpose_rule(transpose_rule)
+
+
+ def event_ellmv_p_call(spikes, weights, indices, *, n_post, block_size, float_as_event):
+ n_conn = indices.shape[1]
+ if block_size is None:
+ if n_conn <= 16:
+ block_size = 16
+ elif n_conn <= 32:
+ block_size = 32
+ elif n_conn <= 64:
+ block_size = 64
+ elif n_conn <= 128:
+ block_size = 128
+ elif n_conn <= 256:
+ block_size = 256
+ else:
+ block_size = 128
+ return event_ellmv_p(
+ spikes,
+ weights,
+ indices,
+ outs=[jax.ShapeDtypeStruct([n_post], weights.dtype)],
+ block_size=block_size,
+ float_as_event=float_as_event,
+ n_pre=spikes.shape[0],
+ n_conn=indices.shape[1],
+ n_post=n_post,
+ weight_info=jax.ShapeDtypeStruct(weights.shape, weights.dtype),
+ spike_info=jax.ShapeDtypeStruct(spikes.shape, spikes.dtype),
+ )
+
+
+ def ell_cpu_kernel_generator(
+ weight_info: jax.ShapeDtypeStruct,
+ **kwargs
+ ):
+ import numba # pylint: disable=import-outside-toplevel
+
+ if jnp.size(weight_info) == 1:
+ @numba.njit
+ def ell_mv(vector, weights, indices, posts):
+ posts[:] = 0.
+ w = weights[()]
+ for i in range(vector.shape[0]):
+ wv = w * vector[i]
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += wv

- # ∂y/∂gmax
- dgmax = _cpu_event_fixed_prob_mv(indices, gmax_dot, spk, n_post=n_post)
+ else:
+ @numba.njit
+ def ell_mv(vector, weights, indices, posts):
+ posts[:] = 0.
+ for i in range(vector.shape[0]):
+ for j in range(indices.shape[1]):
+ posts[indices[i, j]] += weights[i, j] * vector[i]

- def scan_fn(post, i):
- ids = indices[i]
- w = weight if jnp.size(weight) == 1 else weight[i]
- post = post.at[ids].add(w * spk_dot[i])
- return post, None
+ return ell_mv

- # ∂y/∂gspk
- dspk = jax.lax.scan(scan_fn, jnp.zeros((n_post,), dtype=weight.dtype), np.arange(len(spk)))[0]
- return y, dgmax + dspk

+ def ell_gpu_kernel_generator(
+ block_size: int,
+ n_pre: int,
+ n_conn: int,
+ n_post: int,
+ weight_info: jax.ShapeDtypeStruct,
+ **kwargs
+ ):
+ homo = jnp.size(weight_info) == 1
+
+ if homo:
+ def _kernel(
+ vec_ref, ind_ref, _, out_ref,
+ ):
+ # each block processes a [block_size] slice of the vector
+ # each block processes a [block_size, block_size] tile of indices and weights
+
+ # -------------------------------
+ # vec_ref: [block_size]
+ # ind_ref: [block_size, block_size]
+ # out_ref: [n_post]
+
+ r_pid = pl.program_id(0)
+ c_start = pl.program_id(1) * block_size
+ mask = jnp.arange(block_size) + c_start
+ row_length = jnp.minimum(n_pre - r_pid * block_size, block_size)
+
+ def body_fn(j, _):
+ y = vec_ref[j] * jnp.ones(block_size, dtype=weight_info.dtype)
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ pl.atomic_add(out_ref, ind, y, mask=mask)
+
+ jax.lax.fori_loop(0, row_length, body_fn, None)
+
+ # homogeneous weights
+ kernel = pl.pallas_call(
+ _kernel,
+ out_shape=[
+ jax.ShapeDtypeStruct((n_post,), weight_info.dtype),
+ ],
+ in_specs=[
+ pl.BlockSpec((block_size,), lambda i, j: i), # vec_ref
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)), # ind_ref
+ pl.BlockSpec((n_post,), lambda i, j: 0) # out_ref
+ ],
+ grid=(
+ pl.cdiv(n_pre, block_size),
+ pl.cdiv(n_conn, block_size),
+ ),
+ input_output_aliases={2: 0},
+ interpret=False
+ )
+ return lambda vector, weight, indices: kernel(vector, indices, jnp.zeros(n_post, dtype=weight.dtype)) * weight

- _cpu_event_fixed_prob_mv_jvp = jax.custom_jvp(_cpu_event_fixed_prob_mv, nondiff_argnums=(0, 3))
- _cpu_event_fixed_prob_mv_jvp.defjvp(_cpu_event_fixed_prob_mv_jvp_rule)
+ else:
+ def _kernel(
+ vec_ref, ind_ref, w_ref, _, out_ref,
+ ):
+ # each block processes a [block_size] slice of the vector
+ # each block processes a [block_size, n_conn] tile of indices and weights
+
+ # -------------------------------
+ # vec_ref: [block_size]
+ # ind_ref: [block_size, block_size]
+ # w_ref: [block_size, block_size]
+ # out_ref: [n_post]
+
+ r_pid = pl.program_id(0)
+ c_start = pl.program_id(1) * block_size
+ mask = jnp.arange(block_size) + c_start
+ row_length = jnp.minimum(n_pre - r_pid * block_size, block_size)
+
+ def body_fn(j, _):
+ w = pl.load(w_ref, (j, pl.dslice(None)), mask=mask)
+ y = w * vec_ref[j]
+ ind = pl.load(ind_ref, (j, pl.dslice(None)), mask=mask)
+ pl.atomic_add(out_ref, ind, y, mask=mask)
+
+ jax.lax.fori_loop(0, row_length, body_fn, None)
+
+ # heterogeneous weights
+ kernel = pl.pallas_call(
+ _kernel,
+ out_shape=[
+ jax.ShapeDtypeStruct((n_post,), weight_info.dtype),
+ ],
+ in_specs=[
+ pl.BlockSpec((block_size,), lambda i, j: i), # vec_ref
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)), # ind_ref
+ pl.BlockSpec((block_size, block_size), lambda i, j: (i, j)), # w_ref
+ pl.BlockSpec((n_post,), lambda i: 0) # out_ref
+ ],
+ grid=(
+ pl.cdiv(n_pre, block_size),
+ pl.cdiv(n_conn, block_size),
+ ),
+ input_output_aliases={3: 0},
+ interpret=False
+ )
+ return lambda vector, weight, indices: kernel(vector, indices, weight, jnp.zeros(n_post, dtype=weight.dtype))
+
+
+ def jvp_weights_no_spk(w_dot, vector, weights, indices, *, block_size, n_post, **kwargs):
+ return ellmv_p_call(
+ vector,
+ w_dot,
+ indices,
+ block_size=block_size,
+ n_post=n_post,
+ )
+
+
+ def transpose_rule_no_spk(
+ ct, vector, weights, indices,
+ *,
+ n_post, block_size, weight_info, **kwargs
+ ):
+ if ad.is_undefined_primal(indices):
+ raise ValueError("Cannot transpose with respect to sparse indices.")

+ ct = ct[0]

- def _gpu_event_fixed_prob_mv(indices, g_max, spk, n_post: int) -> jax.Array:
- def scan_fn(post, i):
- w = g_max if jnp.size(g_max) == 1 else g_max[i]
- ids = indices[i]
- sp = spk[i]
- if spk.dtype == jnp.bool_:
- post = jax.lax.cond(sp, lambda: post.at[ids].add(w), lambda: post)
+ # ∂L/∂spk = ∂L/∂y * ∂y/∂spk
+ homo = weight_info.size == 1
+ if ad.is_undefined_primal(vector):
+ if homo:
+ # homogeneous weight
+ ct_spk = jax.vmap(lambda idx: jnp.sum(ct[idx] * weights))(indices)
  else:
- post = jax.lax.cond(sp == 0., lambda: post, lambda: post.at[ids].add(w * sp))
- return post, None
+ # heterogeneous weight
+ ct_spk = jax.vmap(lambda idx, w: jnp.inner(ct[idx], w))(indices, weights)
+ return (ad.Zero(vector) if type(ct) is ad.Zero else ct_spk), weights, indices

- return jax.lax.scan(scan_fn, jnp.zeros((n_post,), dtype=g_max.dtype), np.arange(len(spk)))[0]
+ else:
+ # ∂L/∂w = ∂L/∂y * ∂y/∂w
+ if homo:
+ # scalar
+ ct_gmax = ellmv_p_call(
+ vector,
+ jnp.asarray(1., dtype=weight_info.dtype),
+ indices,
+ block_size=block_size,
+ n_post=n_post,
+ )
+ ct_gmax = jnp.inner(ct, ct_gmax[0])
+ else:
+ ct_gmax = jax.vmap(lambda vec, one_ind: ct[one_ind] * vec)(vector, indices)
+ return vector, (ad.Zero(weights) if type(ct) is ad.Zero else ct_gmax), indices
+
+
+ ellmv_p = XLACustomOp(
+ 'ell_mv',
+ cpu_kernel_generator=ell_cpu_kernel_generator,
+ gpu_kernel_generator=ell_gpu_kernel_generator,
+ )
+ ellmv_p.defjvp(jvp_spikes, jvp_weights_no_spk, None)
+ ellmv_p.def_transpose_rule(transpose_rule_no_spk)
+
+
+ def ellmv_p_call(vector, weights, indices, *, n_post, block_size):
+ n_conn = indices.shape[1]
+ if block_size is None:
+ if n_conn <= 16:
+ block_size = 16
+ elif n_conn <= 32:
+ block_size = 32
+ elif n_conn <= 64:
+ block_size = 64
+ elif n_conn <= 128:
+ block_size = 128
+ elif n_conn <= 256:
+ block_size = 256
+ else:
+ block_size = 128
+ return ellmv_p(
+ vector,
+ weights,
+ indices,
+ n_post=n_post,
+ n_pre=indices.shape[0],
+ n_conn=indices.shape[1],
+ block_size=block_size,
+ weight_info=jax.ShapeDtypeStruct(weights.shape, weights.dtype),
+ outs=[jax.ShapeDtypeStruct([n_post], weights.dtype)]
+ )
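Taken together, the two new primitives replace the old custom_vjp/custom_jvp pair: event_ellmv_p runs the event-driven forward pass, ellmv_p supplies the dense (non-event) product used for the spike JVP, and the transpose rules above provide the VJPs, so gradients flow on either backend. A hedged sketch of differentiating through the new path; the import from the private module is an assumption made for illustration, and all shapes are arbitrary:

import jax
import jax.numpy as jnp
import numpy as np
# Illustrative import only; the diff does not show how event_fixed_prob is exported.
from brainstate.event._fixed_probability import event_fixed_prob

n_pre, n_post, n_conn = 100, 80, 8
indices = np.random.randint(0, n_post, size=(n_pre, n_conn))
spikes = (np.random.rand(n_pre) < 0.1).astype(jnp.float32)
weights = jnp.full((n_pre, n_conn), 0.5, dtype=jnp.float32)

def loss(w):
    out = event_fixed_prob(spikes, w, indices,
                           n_post=n_post, block_size=None, float_as_event=True)
    return jnp.sum(out ** 2)

# Routed through jvp_weights / transpose_rule registered on event_ellmv_p above.
grads = jax.grad(loss)(weights)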