rxnn 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/experimental/attention.py +442 -88
- rxnn/experimental/models.py +117 -0
- rxnn/experimental/moe.py +206 -0
- rxnn/transformers/moe.py +42 -86
- {rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/METADATA +1 -1
- {rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/RECORD +8 -6
- {rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/LICENSE +0 -0
- {rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/WHEEL +0 -0
rxnn/experimental/attention.py
CHANGED
@@ -1,5 +1,6 @@
 import torch
-
+import torch.nn as nn
+import torch.nn.functional as F
 from ..transformers.attention import MultiHeadAttention, GroupedQueryAttention
 from ..transformers.positional import RotaryPositionalEmbedding
 from ..transformers.moe import MoeRouter
@@ -9,6 +10,7 @@ from ..transformers.moe import MoeRouter
 class GroupedMoeAttention(GroupedQueryAttention):
     """
     Grouped MoE Attention (GMA) - GQA extended with Mixture-of-Experts (MoE) routing.
+
     Instead of mapping keys/values to static head groups, it dynamically selects head expert groups. It has the same
     number of total keys/values heads as query heads, but uses only a selected group for attention calculation.
     - with num_groups set to 1, it will be MoE MultiQueryAttention
@@ -20,8 +22,11 @@ class GroupedMoeAttention(GroupedQueryAttention):

     Optionally, it could use even more expert heads than attention heads - in example:
     - 512 dim divided into 16 heads with 32 dim, using 4 head groups - may use i.e., 24 total expert heads - still only
-
+    4 will be used for attention calculation, while 16 is used to split dimensions (in that case it will have 16 query heads)
+
+    © 2025 Adam Filipek
     """
+
     def __init__(
         self,
         embed_dim: int,
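Editor's note: the docstring above describes the core GMA idea - a router picks, per token, which expert key/value heads form the active group. Below is a minimal, standalone sketch of that routing step in plain PyTorch (all names and sizes are illustrative, not the library's API); the actual `GroupedMoeAttention` implementation follows in the next hunks.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

embed_dim, num_experts, num_groups, head_dim = 512, 16, 4, 32
gate = nn.Linear(embed_dim, num_experts, bias=False)        # illustrative routing gate
wk = torch.randn(num_experts, embed_dim, head_dim) * 0.02   # per-expert key projections

x = torch.randn(2, 10, embed_dim)                           # (batch, seq, embed_dim)
probs = F.softmax(gate(x), dim=-1)                          # (B, S, num_experts)
weights, indices = probs.topk(num_groups, dim=-1)           # per-token expert-head group

# For clarity this computes every expert's projection and then gathers the selected
# ones; the non-vectorized class below instead projects only the routed tokens.
k_all = torch.einsum('bsd,edh->bseh', x, wk)                # (B, S, num_experts, head_dim)
k_sel = torch.gather(k_all, 2, indices.unsqueeze(-1).expand(-1, -1, -1, head_dim))
k = k_sel * weights.unsqueeze(-1)                           # (B, S, num_groups, head_dim)
print(k.shape)                                              # torch.Size([2, 10, 4, 32])
```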
@@ -39,7 +44,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
         *args,
         **kwargs,
     ):
-        self.num_experts = num_experts
+        self.num_experts = num_experts or num_heads
         super(GroupedMoeAttention, self).__init__(
             embed_dim,
             num_heads,
@@ -58,7 +63,228 @@ class GroupedMoeAttention(GroupedQueryAttention):

     def _init_kv(self, embed_dim: int):
         self.router = MoeRouter(embed_dim, self.num_experts, top_k=self.num_groups)
-
+
+        hidden_dim = embed_dim // self.num_heads
+        self.wk = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
+        self.bk = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
+        self.wv = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
+        self.bv = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
+        self._init_experts()
+
+    def _init_experts(self):
+        nn.init.xavier_uniform_(self.wk)
+        nn.init.xavier_uniform_(self.wv)
+        if self.use_bias:
+            nn.init.zeros_(self.bk)
+            nn.init.zeros_(self.bv)
+
+    def _process_grouped_experts(self, x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, weights: torch.Tensor, indices: torch.Tensor):
+        B, S, G = indices.shape
+        x_flat = x.view(-1, x.size(-1))
+
+        # Flatten batch and sequence dimensions
+        indices_flat = indices.view(-1, G)
+        weights_flat = weights.view(-1, G, 1)
+
+        # Create expanded indices for expert processing
+        mask = torch.zeros(B * S, self.num_experts, device=x.device, dtype=torch.bool)
+        for g in range(G):
+            mask.scatter_(1, indices_flat[:, g].unsqueeze(1), True)
+
+        output = torch.zeros(B * S, G, w.size(2), device=x.device, dtype=x.dtype)
+
+        for e in range(self.num_experts):
+            token_mask = mask[:, e]
+            if not token_mask.any():
+                continue
+
+            # Get positions where expert e is used in any group
+            x_slice = x_flat[token_mask]
+            proj = F.linear(x_slice, w[e], b[e] if b is not None else None)
+
+            # Find which groups use this expert for selected tokens
+            group_mask = (indices_flat[token_mask] == e)
+
+            # Accumulate projections for relevant groups
+            weighted_proj = proj.unsqueeze(1) * weights_flat[token_mask] * group_mask.unsqueeze(-1).float()
+            output[token_mask] += weighted_proj.sum(dim=1)
+
+        return output.view(B, S, G, -1)
+
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int,
+                     skip_query_processing: bool = False):
+        q = self.q_proj(query).view(b, t, self.num_heads, -1).transpose(1, 2) if not skip_query_processing else query
+
+        # Key/Value processing
+        B, S, _ = key.shape
+        weights_k, indices_k = self.router(key)
+        k = self._process_grouped_experts(key, self.wk, self.bk, weights_k, indices_k)
+        v = self._process_grouped_experts(value, self.wv, self.bv, weights_k, indices_k)
+
+        # Expand to GQA format
+        k = k.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
+        v = v.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
+
+        if not self.use_flash_attention:
+            group_heads = self.num_heads // self.num_groups
+
+            k = k.unsqueeze(2).expand(-1, -1, group_heads, -1, -1)  # (B, G, group_heads, S, head_dim)
+            v = v.unsqueeze(2).expand(-1, -1, group_heads, -1, -1)  # (B, G, group_heads, S, head_dim)
+
+            k = k.flatten(start_dim=1, end_dim=2)  # (B, H, S, head_dim)
+            v = v.flatten(start_dim=1, end_dim=2)  # (B, H, S, head_dim)
+
+        return q, k, v
+
+
+class DeepMoeAttention(GroupedMoeAttention):
+    """
+    Deep MoE Attention (SMA) - Grouped MoE Attention extended even more for sublinear computational efficiency.
+
+    In addition to using Mixture-of-Experts (MoE) for key/value head groups, SMA is also using dynamically selected
+    query heads - with that approach, each token could attend to every other token, but only partially - only some part of
+    information from each token is used to identify related information parts from other tokens. So, DMA is not spatially
+    sparse (has access to all tokens), but rather structurally sparse (has access only to the part of token's information).
+
+    This solution could reduce the computational complexity of attention operation to sublinear level (<O(N)) and provide
+    a viable and efficient alternative to spatial sparse attention mechanisms like Flex Attention.
+
+    © 2025 Adam Filipek
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_groups: int,
+        dropout: float = 0.0,
+        rope: RotaryPositionalEmbedding = None,
+        rope_only_for_query: bool = False,
+        use_relative_embeddings: bool = False,
+        max_seq_len: int = 1024,
+        use_flash_attention: bool = False,
+        is_causal: bool = False,
+        use_bias: bool = False,
+        num_experts: int = None,
+        num_query_experts: int = None,
+        num_query_groups: int = None,
+        *args,
+        **kwargs,
+    ):
+        self.num_query_experts = num_query_experts if num_query_experts is not None else num_heads
+        self.num_query_groups = num_query_groups if num_query_groups is not None else num_groups
+        super(DeepMoeAttention, self).__init__(
+            embed_dim,
+            num_heads,
+            num_groups=num_groups,
+            dropout=dropout,
+            rope=rope,
+            rope_only_for_query=rope_only_for_query,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            num_experts=num_experts,
+            *args,
+            **kwargs,
+        )
+
+    def _init_q(self, embed_dim: int):
+        self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.num_query_groups)
+
+        hidden_dim = embed_dim // self.num_heads
+        self.wq = nn.Parameter(torch.empty(self.num_query_experts, embed_dim, hidden_dim))
+        self.bq = nn.Parameter(torch.zeros(self.num_query_experts, hidden_dim)) if self.use_bias else None
+        self._init_query_experts()
+
+    def _init_query_experts(self):
+        nn.init.xavier_uniform_(self.wq)
+        if self.use_bias:
+            nn.init.zeros_(self.bq)
+
+    def _init_out(self, embed_dim: int):
+        """Initialize output projection"""
+        hidden_dim = embed_dim // (self.num_heads // self.num_query_groups)
+        self.out_proj = nn.Linear(hidden_dim, embed_dim)
+
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int, skip_query_processing: bool = False):
+        # Query processing
+        B, T, _ = query.shape
+        weights_q, indices_q = self.query_router(query)
+        q = self._process_grouped_experts(query, self.wq, self.bq, weights_q, indices_q)
+        q = q.permute(0, 2, 1, 3).reshape(B, self.num_query_groups, T, -1)
+
+        # Expand query groups to match head count
+        group_heads = self.num_heads // self.num_query_groups
+        q = q.unsqueeze(2).expand(-1, -1, group_heads, -1, -1).flatten(1, 2).transpose(1, 2)
+
+        # Key/Value processing
+        return super()._forward_qkv(q, key, value, b, t, d, skip_query_processing=True)
+
+# Vectorized
+
+class GroupedMoeAttentionVectorized(GroupedQueryAttention):
+    """
+    Vectorized implementation calculates all expert heads for each token and selecting active tokens later. Linear layers
+    for Attention are rather small, compared to MoE Feed Forward layers, so it's possible that it will be faster than filtering
+    experts - it has to be tested.
+
+    Grouped MoE Attention (GMA) - GQA extended with Mixture-of-Experts (MoE) routing.
+
+    Instead of mapping keys/values to static head groups, it dynamically selects head expert groups. It has the same
+    number of total keys/values heads as query heads, but uses only a selected group for attention calculation.
+    - with num_groups set to 1, it will be MoE MultiQueryAttention
+
+    Compared to traditional GQA/MQA, it should provide better performance, because lot less data could be lost using
+    this approach - we are training the full number of keys/values heads, while using only a group.
+
+    In case of efficiency, it should be close to GQA/MQA linear performance, but with a small MoE routing overhead.
+
+    Optionally, it could use even more expert heads than attention heads - in example:
+    - 512 dim divided into 16 heads with 32 dim, using 4 head groups - may use i.e., 24 total expert heads - still only
+    4 will be used for attention calculation, while 16 is used to split dimensions (in that case it will have 16 query heads)
+
+    © 2025 Adam Filipek
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_groups: int,
+        dropout: float = 0.0,
+        rope: RotaryPositionalEmbedding = None,
+        rope_only_for_query: bool = False,
+        use_relative_embeddings: bool = False,
+        max_seq_len: int = 1024,
+        use_flash_attention: bool = False,
+        is_causal: bool = False,
+        use_bias: bool = False,
+        num_experts: int = None,
+        *args,
+        **kwargs,
+    ):
+        self.num_experts = num_experts if num_experts is not None else num_heads
+        super(GroupedMoeAttentionVectorized, self).__init__(
+            embed_dim,
+            num_heads,
+            num_groups=num_groups,
+            dropout=dropout,
+            rope=rope,
+            rope_only_for_query=rope_only_for_query,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            *args,
+            **kwargs,
+        )
+
+    def _init_kv(self, embed_dim: int):
+        self.router = MoeRouter(embed_dim, self.num_experts, top_k=self.num_groups)
+        hidden_dim = embed_dim // self.num_heads
         self.wk = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
         self.bk = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
         self.wv = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
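Editor's note: `_process_grouped_experts` above is the heart of the non-vectorized path - each expert projects only the tokens that routed to it. The sketch below reproduces that masked per-expert loop on plain tensors; it is a simplified illustration (hypothetical shapes, weighted slots written directly), not a drop-in replacement for the method.

```python
import torch

num_experts, G, embed_dim, head_dim, n_tokens = 8, 2, 64, 16, 5
tokens = torch.randn(n_tokens, embed_dim)                  # flattened (batch*seq, embed_dim)
w = torch.randn(num_experts, embed_dim, head_dim) * 0.02   # per-expert projection weights
indices = torch.randint(0, num_experts, (n_tokens, G))     # routed experts per token/slot
weights = torch.rand(n_tokens, G, 1)                       # routing weights per slot

output = torch.zeros(n_tokens, G, head_dim)
for e in range(num_experts):
    token_mask = (indices == e).any(dim=-1)                # tokens that selected expert e
    if not token_mask.any():
        continue
    proj = tokens[token_mask] @ w[e]                       # project only those tokens
    slot_mask = (indices[token_mask] == e).unsqueeze(-1).float()
    # write the weighted projection into the slots that actually picked expert e
    output[token_mask] += proj.unsqueeze(1) * weights[token_mask] * slot_mask
print(output.shape)                                        # torch.Size([5, 2, 16])
```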
@@ -72,47 +298,37 @@ class GroupedMoeAttention(GroupedQueryAttention):
             torch.nn.init.zeros_(self.bk)
             torch.nn.init.zeros_(self.bv)

-    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int,
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int,
+                     skip_query_processing: bool = False):
+        # Indexed version may cause memory overflow
+        #
         # head_dim = d // self.num_heads
-        # group_heads = self.num_heads // self.num_groups
         #
         # # Process Query as in GQA
-        # q = self.q_proj(query).view(b, t, self.num_heads, head_dim).transpose(1,
+        # q = self.q_proj(query).view(b, t, self.num_heads, head_dim).transpose(1,
+        #     2) if not skip_query_processing else query
         #
         # # Process Key and Value with MoE routing
-        # key_flat = key.view(-1, d)
-        #
-        # weights = weights.view(b, key.size(1), self.num_groups, 1)
-        # indices = indices.view(b, key.size(1), self.num_groups)
+        # key_flat = key.view(-1, d)  # (B*S, d)
+        # value_flat = value.view(-1, d)  # (B*S, d)
         #
-        # #
-        #
-        #
-        #
-        #     key_flat,
-        #     self.wk.view(self.num_experts, d, -1)
-        # ).view(b, key.size(1), self.num_experts, -1)
+        # # Get routing indices and weights for K
+        # weights_k, indices_k = self.router(key_flat)
+        # indices_k = indices_k.view(-1, self.top_k)  # (B*S, top_k)
+        # weights_k = weights_k.view(-1, self.top_k, 1)  # (B*S, top_k, 1)
         #
-        #
-        #
-        #
-        #
-        #
+        # # Select and compute K projections for only the top_k experts
+        # selected_k_weights = self.k_experts[indices_k]  # (B*S, top_k, d, k_out_dim)
+        # k_proj = torch.einsum('bd, behd -> beh', key_flat.unsqueeze(1), selected_k_weights)
+        # selected_k = (k_proj * weights_k).sum(dim=1)  # (B*S, k_out_dim)
+        # selected_k = selected_k.view(b, key.size(1), -1)  # (B, S, k_out_dim)
         #
-        # #
-        #
-        #
-        #
-        #
-        # )
-        # selected_v = torch.gather(
-        #     v_all,
-        #     2,
-        #     indices.unsqueeze(-1).expand(-1, -1, -1, v_all.size(-1))
-        # )
+        # # Compute V using the same indices as K (since they share the same router)
+        # selected_v_weights = self.v_experts[indices_k]
+        # v_proj = torch.einsum('bd, behd -> beh', value_flat.unsqueeze(1), selected_v_weights)
+        # selected_v = (v_proj * weights_k).sum(dim=1)
+        # selected_v = selected_v.view(b, value.size(1), -1)  # (B, S, k_out_dim)
         #
-        # selected_k = (selected_k * weights).sum(dim=2)
-        # selected_v = (selected_v * weights).sum(dim=2)
         # # Reshape to GQA format: (B, G, S, head_dim)
         # k = selected_k.view(b, key.size(1), self.num_groups, head_dim).transpose(1, 2)
         # v = selected_v.view(b, value.size(1), self.num_groups, head_dim).transpose(1, 2)
@@ -127,32 +343,46 @@ class GroupedMoeAttention(GroupedQueryAttention):
         # v = v.flatten(start_dim=1, end_dim=2)  # (B, H, S, head_dim)
         #
         # return q, k, v
+
         head_dim = d // self.num_heads

         # Process Query as in GQA
-        q = self.q_proj(query).view(b, t, self.num_heads, head_dim).transpose(1, 2)
+        q = self.q_proj(query).view(b, t, self.num_heads, head_dim).transpose(1, 2)

         # Process Key and Value with MoE routing
-        key_flat = key.view(-1, d)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        key_flat = key.view(-1, d)
+        weights, indices = self.router(key_flat)
+        weights = weights.view(b, key.size(1), self.num_groups, 1)
+        indices = indices.view(b, key.size(1), self.num_groups)
+
+        # Compute all experts' K and V projections
+        # Shape: (batch_size, seq_len, num_experts, head_dim * num_groups)
+        k_all = torch.einsum(
+            'be, ehd -> bedh',
+            key_flat,
+            self.wk.view(self.num_experts, d, -1)
+        ).view(b, key.size(1), self.num_experts, -1)
+
+        v_all = torch.einsum(
+            'be, ehd -> bedh',
+            value.view(-1, d),
+            self.wv.view(self.num_experts, d, -1)
+        ).view(b, value.size(1), self.num_experts, -1)
+
+        # Select top_k experts and compute weighted sum
+        selected_k = torch.gather(
+            k_all,
+            2,
+            indices.unsqueeze(-1).expand(-1, -1, -1, k_all.size(-1))
+        )
+        selected_v = torch.gather(
+            v_all,
+            2,
+            indices.unsqueeze(-1).expand(-1, -1, -1, v_all.size(-1))
+        )

+        selected_k = (selected_k * weights).sum(dim=2)
+        selected_v = (selected_v * weights).sum(dim=2)
         # Reshape to GQA format: (B, G, S, head_dim)
         k = selected_k.view(b, key.size(1), self.num_groups, head_dim).transpose(1, 2)
         v = selected_v.view(b, value.size(1), self.num_groups, head_dim).transpose(1, 2)
@@ -168,15 +398,26 @@ class GroupedMoeAttention(GroupedQueryAttention):

         return q, k, v

-
+
+class DeepMoeAttentionVectorized(GroupedMoeAttentionVectorized):
     """
-
+    Vectorized implementation calculates all expert heads for each token and selecting active tokens later. Linear layers
+    for Attention are rather small, compared to MoE Feed Forward layers, so it's possible that it will be faster than filtering
+    experts - it has to be tested.
+
+    Deep MoE Attention (SMA) - Grouped MoE Attention extended even more for sublinear computational efficiency.
+
     In addition to using Mixture-of-Experts (MoE) for key/value head groups, SMA is also using dynamically selected
     query heads - with that approach, each token could attend to every other token, but only partially - only some part of
-    information from each token is used to identify related information parts from other tokens.
+    information from each token is used to identify related information parts from other tokens. So, DMA is not spatially
+    sparse (has access to all tokens), but rather structurally sparse (has access only to the part of token's information).
+
+    This solution could reduce the computational complexity of attention operation to sublinear level (<O(N)) and provide
+    a viable and efficient alternative to spatial sparse attention mechanisms like Flex Attention.

-
+    © 2025 Adam Filipek
     """
+
     def __init__(
         self,
         embed_dim: int,
@@ -192,13 +433,13 @@ class SparseMoeAttention(GroupedMoeAttention):
         use_bias: bool = False,
         num_experts: int = None,
         num_query_experts: int = None,
-
+        num_query_groups: int = None,
         *args,
         **kwargs,
     ):
         self.num_query_experts = num_query_experts if num_query_experts is not None else num_heads
-        self.
-        super(
+        self.num_query_groups = num_query_groups if num_query_groups is not None else num_groups
+        super(DeepMoeAttentionVectorized, self).__init__(
             embed_dim,
             num_heads,
             num_groups=num_groups,
@@ -216,8 +457,8 @@ class SparseMoeAttention(GroupedMoeAttention):
         )

     def _init_q(self, embed_dim: int):
-        self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.
-        hidden_dim = embed_dim //
+        self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.num_query_groups)
+        hidden_dim = embed_dim // self.num_heads
         self.wq = nn.Parameter(torch.empty(self.num_query_experts, embed_dim, hidden_dim))
         self.bq = nn.Parameter(torch.zeros(self.num_query_experts, hidden_dim)) if self.use_bias else None
         self._init_query_experts()
@@ -227,20 +468,47 @@ class SparseMoeAttention(GroupedMoeAttention):
         if self.use_bias:
             torch.nn.init.zeros_(self.bq)

+    def _init_out(self, embed_dim: int):
+        """Initialize output projection"""
+        self.out_proj = nn.Linear(embed_dim // (self.num_heads // self.num_groups), embed_dim)
+
     def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int):
+        # Indexed version may cause memory overflow
+        #
+        # head_dim = d // self.num_heads
+        #
+        # # Process Query with MoE routing
+        # query_flat = query.view(-1, d)  # (B*T, d)
+        # weights_q, indices_q = self.query_router(query_flat)
+        # indices_q = indices_q.view(-1, self.num_query_groups)  # (B*T, top_k_q)
+        # weights_q = weights_q.view(-1, self.num_query_groups, 1)  # (B*T, top_k_q, 1)
+        #
+        # # Select and compute Q projections for top_k experts
+        # selected_q_weights = self.wq[indices_q]  # (B*T, top_k_q, d, head_dim*num_heads)
+        # q_proj = torch.einsum('bd, behd -> beh', query_flat.unsqueeze(1), selected_q_weights)
+        # selected_q = (q_proj * weights_q).sum(dim=1)  # (B*T, head_dim*num_heads)
+        # selected_q = selected_q.view(b, t, -1)  # (B, T, head_dim*num_heads)
         head_dim = d // self.num_heads

         # Process Query with MoE routing
-        query_flat = query.view(
-        weights_q, indices_q = self.
-
-
-
-        #
-
-
-
-
+        query_flat = query.view(b * t, d)
+        weights_q, indices_q = self.query_router(query_flat)
+        weights_q = weights_q.view(b, t, self.num_query_groups, 1)
+        indices_q = indices_q.view(b, t, self.num_query_groups)
+
+        # Compute all experts' Q projections
+        q_all = torch.einsum(
+            'be, ehd -> bedh',
+            query_flat,
+            self.wq.view(self.num_query_experts, d, -1)
+        ).view(b, t, self.num_query_experts, -1)
+
+        selected_q = torch.gather(
+            q_all,
+            2,
+            indices_q.unsqueeze(-1).expand(-1, -1, -1, q_all.shape[-1])
+        )
+        selected_q = (selected_q * weights_q).sum(dim=2)

         q = selected_q.view(b, t, self.num_heads, head_dim).transpose(1, 2)  # (B, H, T, head_dim)

@@ -251,12 +519,12 @@ class SparseMoeAttention(GroupedMoeAttention):

 class FlexAttention(MultiHeadAttention):
     def __init__(
-
-
-
-
-
-
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_global_tokens: int = 16,
+        window_size: int = 128,
+        **kwargs
     ):
         super().__init__(embed_dim, num_heads, **kwargs)
         self.num_global_tokens = num_global_tokens
@@ -319,14 +587,15 @@ class FlexAttention(MultiHeadAttention):
         output = self._calculate_output(combined_attn, v, b, t, d)
         return self.out_proj(output)

+
 class InfiniteAttention(MultiHeadAttention):
     def __init__(
-
-
-
-
-
-
+        self,
+        embed_dim: int,
+        num_heads: int,
+        kernel_size: int = 128,
+        use_rotary: bool = True,
+        **kwargs
     ):
         super().__init__(embed_dim, num_heads, **kwargs)
         self.kernel_size = kernel_size
@@ -377,4 +646,89 @@ class InfiniteAttention(MultiHeadAttention):
         q = q / (q.shape[-1] ** 0.5)
         attn = torch.einsum('b h i d, b h j d -> b h i j', q, k)
         attn = torch.softmax(attn, dim=-1)
-        return torch.einsum('b h i j, b h j d -> b h i d', attn, v)
+        return torch.einsum('b h i j, b h j d -> b h i d', attn, v)
+
+def init_moe_attention(
+        embed_dim: int,
+        num_heads: int,
+        attention_type: str,
+        gqa_groups: int = 1,
+        dropout: float = 0.0,
+        rope: RotaryPositionalEmbedding = None,
+        rope_only_for_query: bool = False,
+        use_relative_embeddings: bool = False,
+        max_seq_len: int = 1024,
+        use_flash_attention: bool = False,
+        is_causal: bool = False,
+        use_bias: bool = False,
+        num_experts: int = None,
+        num_query_experts: int = None,
+        num_query_groups: int = None,
+) -> GroupedQueryAttention:
+    assert attention_type == 'gma' or attention_type == 'dma' or attention_type == 'gma_v' or attention_type == 'dma_v', \
+        "Error, attention type should be one of: 'gma', 'dma', 'gma_v', 'dma_v'"
+
+    if attention_type == "gma":
+        return GroupedMoeAttention(
+            embed_dim,
+            num_heads,
+            gqa_groups,
+            dropout=dropout,
+            rope=rope,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            rope_only_for_query=rope_only_for_query,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            num_experts=num_experts,
+        )
+    elif attention_type == "dma":
+        return DeepMoeAttention(
+            embed_dim,
+            num_heads,
+            gqa_groups,
+            dropout=dropout,
+            rope=rope,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            rope_only_for_query=rope_only_for_query,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            num_experts=num_experts,
+            num_query_experts=num_query_experts,
+            num_query_groups=num_query_groups,
+        )
+    elif attention_type == "gma_v":
+        return GroupedMoeAttentionVectorized(
+            embed_dim,
+            num_heads,
+            gqa_groups,
+            dropout=dropout,
+            rope=rope,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            rope_only_for_query=rope_only_for_query,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            num_experts=num_experts,
+        )
+    else:
+        return DeepMoeAttentionVectorized(
+            embed_dim,
+            num_heads,
+            gqa_groups,
+            dropout=dropout,
+            rope=rope,
+            use_relative_embeddings=use_relative_embeddings,
+            max_seq_len=max_seq_len,
+            rope_only_for_query=rope_only_for_query,
+            use_flash_attention=use_flash_attention,
+            is_causal=is_causal,
+            use_bias=use_bias,
+            num_experts=num_experts,
+            num_query_experts=num_query_experts,
+            num_query_groups=num_query_groups,
+        )
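Editor's note: a hedged usage sketch of the `init_moe_attention` factory added above, using only argument names visible in this hunk; the values are illustrative and the RoPE construction mirrors the pattern used in `rxnn/experimental/models.py` later in this diff.

```python
from rxnn.transformers.positional import RotaryPositionalEmbedding
from rxnn.experimental.attention import init_moe_attention

embed_dim, num_heads, seq_len = 512, 16, 1024
rope = RotaryPositionalEmbedding(embed_dim // num_heads, seq_len)  # head_dim, max_seq_len

attn = init_moe_attention(
    embed_dim,
    num_heads,
    'dma',                  # one of 'gma', 'dma', 'gma_v', 'dma_v'
    gqa_groups=4,           # active key/value head groups
    rope=rope,
    max_seq_len=seq_len,
    is_causal=True,
    num_experts=24,         # more expert heads than attention heads is allowed
    num_query_experts=16,
    num_query_groups=8,
)
# The returned module extends GroupedQueryAttention, so it plugs into the same
# transformer layers as the standard attention variants.
```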
rxnn/experimental/models.py
ADDED
@@ -0,0 +1,117 @@
+import torch
+from torch import nn
+from typing import TypedDict, Union
+from huggingface_hub import PyTorchModelHubMixin
+from ..transformers.positional import RotaryPositionalEmbedding
+from ..transformers.attention import init_attention
+from ..transformers.layers import ClassicTransformerLayer
+from ..transformers.models import ClassicTransformerDecoder
+from ..transformers.ff import get_activation_layer
+from ..memory.stm import ShortTermMemory
+from ..utils import get_model_size
+from .attention import init_moe_attention
+
+
+class MoeAttentionTransformerConfig(TypedDict):
+    num_layers: int
+    vocab_size: int
+    embed_dim: int
+    ff_dim: int
+    att_heads: int
+    seq_len: int
+    use_flash_attention: bool
+    use_gated: bool
+    ff_activation: str
+    ff_dropout: float
+    att_dropout: float
+    use_rms_norm: bool
+    att_groups: int
+    use_moe_ff: bool
+    ff_num_experts: int
+    ff_moe_top_k: int
+    att_type: str
+    att_num_experts: int
+    att_num_query_experts: int
+    att_num_query_groups: int
+
+
+class MoeAttentionTransformer(nn.Module, PyTorchModelHubMixin, pipeline_tag="text-generation", license="apache-2.0"):
+    """Research model for experiments with Mixture-of-Experts Attention"""
+
+    def __init__(
+        self,
+        num_layers: int = 6,
+        vocab_size: int = 5000,
+        embed_dim: int = 128,
+        ff_dim: int = 384,
+        att_heads: int = 16,
+        seq_len: int = 256,
+        use_flash_attention: bool = True,
+        use_gated: bool = True,
+        ff_activation: str = "swish",
+        ff_dropout: float = 0.0,
+        att_dropout: float = 0.0,
+        use_rms_norm: bool = True,
+        att_groups: int = 1,
+        use_moe_ff: bool = False,
+        ff_num_experts: int = 1,
+        ff_moe_top_k: int = 1,
+        att_type: str = 'gma',
+        att_num_experts: int = None,
+        att_num_query_experts: int = None,
+        att_num_query_groups: int = None,
+        **kwargs
+    ):
+        super(MoeAttentionTransformer, self).__init__(**kwargs)
+        assert ff_activation in ['relu', 'gelu',
+                                 'swish', 'silu', 'linear',
+                                 'sigmoid'], 'Feed-forward activation could be "relu", "gelu", "swish", "silu", "linear", "sigmoid".'
+        assert att_type in ['mha', 'gqa', 'mqa', 'gma', 'dma', 'gma_v',
+                            'dma_v'], 'Self-attention type could be "mha", "gqa", "mqa", "gma", "dma", "gma_v", "dma_v"'
+
+        embedding = nn.Embedding(vocab_size, embed_dim)
+        rope = RotaryPositionalEmbedding(embed_dim // att_heads, seq_len)
+
+        ff_activation = get_activation_layer(ff_activation)
+
+        if att_type in ['mha', 'gqa', 'mqa']:
+            att_init = lambda: init_attention(embed_dim, att_heads, att_type, att_groups, rope=rope,
+                                              use_flash_attention=use_flash_attention, dropout=att_dropout,
+                                              max_seq_len=seq_len, is_causal=True)
+        else:
+            att_init = lambda: init_moe_attention(embed_dim, att_heads, att_type, att_groups, rope=rope,
+                                                  use_flash_attention=use_flash_attention, dropout=att_dropout,
+                                                  max_seq_len=seq_len, is_causal=True, num_experts=att_num_experts,
+                                                  num_query_experts=att_num_query_experts,
+                                                  num_query_groups=att_num_query_groups)
+
+        self.model = ClassicTransformerDecoder(
+            embed_dim,
+            vocab_size,
+            embedding=embedding,
+            layers=nn.ModuleList([
+                ClassicTransformerLayer(
+                    embed_dim,
+                    ff_dim,
+                    use_gated=use_gated,
+                    use_moe=use_moe_ff,
+                    num_experts=ff_num_experts,
+                    moe_top_k=ff_moe_top_k,
+                    ff_activation=ff_activation,
+                    ff_dropout=ff_dropout,
+                    use_rms_norm=use_rms_norm,
+                    self_attention=att_init(),
+                ) for _ in range(num_layers)
+            ]),
+            use_flash_attention=use_flash_attention,
+        )
+
+    def params_count(self):
+        return get_model_size(self.model)
+
+    def load_shared_embedding(self, embedding: nn.Embedding):
+        self.model.embedding = embedding
+
+    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> Union[
+        torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        return self.model(x, attention_mask=attention_mask)
rxnn/experimental/moe.py
ADDED
@@ -0,0 +1,206 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..transformers.moe import MoeRouter
+
+class DynamicMoeRouter(nn.Module):
+    """Dynamic Mixture-of-Experts Router layer - dynamically selects top-k experts for each token."""
+
+    def __init__(self, embed_dim: int, num_experts: int, top_ks: tuple[int] = (1, 2, 3), *args, **kwargs):
+        super(DynamicMoeRouter, self).__init__(*args, **kwargs)
+        self.top_ks = top_ks
+        self.num_options = len(top_ks)
+        self.num_experts = num_experts
+        self.gate = nn.Linear(embed_dim, num_experts + self.num_options, bias=False)
+        # For expert load balancing
+        self.register_buffer('aux_loss', torch.tensor(0.0), persistent=False)
+
+    def calculate_aux_loss(self, top_k_indices: torch.Tensor, routing_probs: torch.Tensor) -> torch.Tensor:
+        expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
+        expert_usage = expert_mask.sum(dim=0).mean(dim=0)
+        mean_probs = routing_probs.mean(dim=0)
+        return (expert_usage * mean_probs).sum() * self.num_experts
+
+    def forward(self, x: torch.Tensor):
+        # Input shape: [batch*seq_len, embed_dim]
+        all_logits = self.gate(x)
+        routing_logits = all_logits[:, :-self.num_options]
+        options_logits = all_logits[:, -self.num_options:]
+
+        routing_probs = F.softmax(routing_logits, dim=-1)
+        top_k_id = torch.argmax(options_logits, dim=-1).item()
+
+        top_k = self.top_ks[top_k_id]
+
+        # Get top-k experts for each token
+        top_k_weights, top_k_indices = routing_probs.topk(top_k, dim=-1)
+
+        # Normalize weights (sum to 1 for each token)
+        top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
+
+        # Load Balance Loss
+        self.aux_loss = self.calculate_aux_loss(top_k_indices, routing_probs)
+
+        return top_k_weights, top_k_indices, top_k
+
+class MoeFeedForwardVectorized(nn.Module):
+    """
+    Vectorized MoE - current implementation is incorrect - it calculates all the experts, then selects the correct ones.
+
+    Commented out implementation is fixing this problem, but is causing memory overflows, because of experts weights
+    indexing - it's using ~15x more memory, than dense model of similar size, so it's currently not viable.
+
+    It's recommended to use standard MoE from rxnn.transformers.moe instead.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        hidden_dim: int,
+        num_experts: int,
+        activation: nn.Module,
+        top_k: int = 1,
+        dropout: float = 0.0,
+        *args,
+        **kwargs
+    ):
+        super(MoeFeedForwardVectorized, self).__init__(*args, **kwargs)
+        self.embed_dim = embed_dim
+        self.num_experts = num_experts
+        self.top_k = top_k
+
+        self.router = MoeRouter(embed_dim, num_experts, top_k)
+
+        # Batch all expert parameters together
+        self.w1 = nn.Parameter(torch.empty(num_experts, embed_dim, self._w1_dim_factor(hidden_dim)))
+        self.b1 = nn.Parameter(torch.zeros(num_experts, self._w1_dim_factor(hidden_dim)))
+        self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, embed_dim))
+        self.b2 = nn.Parameter(torch.zeros(num_experts, embed_dim))
+        self.activation = activation
+        self.dropout = nn.Dropout(dropout)
+
+        # Initialize parameters
+        self._init_linear_parameters()
+        nn.init.zeros_(self.b1)
+        nn.init.zeros_(self.b2)
+
+    def _init_linear_parameters(self):
+        nn.init.kaiming_normal_(self.w1, nonlinearity='relu')
+        nn.init.kaiming_normal_(self.w2, nonlinearity='relu')
+
+    def _w1_dim_factor(self, hidden_dim: int) -> int:
+        return hidden_dim
+
+    def _activate(self, h: torch.Tensor):
+        return self.activation(h)
+
+    def router_loss(self):
+        return self.router.aux_loss
+
+    def forward(self, x: torch.Tensor):
+        orig_shape = x.shape
+        x = x.view(-1, self.embed_dim)  # [batch*seq_len, embed_dim]
+
+        # Get routing weights and indices
+        weights, indices = self.router(x)  # [batch*seq_len, top_k]
+
+        # Create expert masks and combine it with masks
+        mask = F.one_hot(indices, self.num_experts).float()  # [batch*seq_len, top_k, num_experts]
+        weights = (weights.unsqueeze(-1) * mask).sum(dim=1)  # [batch*seq_len, num_experts]
+
+        # Expert computation
+        x = x.unsqueeze(1).expand(-1, self.num_experts, -1)  # [batch*seq_len, num_experts, embed_dim]
+
+        # First linear layer
+        h = torch.einsum('bie,ieh->bih', x, self.w1) + self.b1  # [batch*seq_len, num_experts, hidden_dim]
+        h = self._activate(h)
+        h = self.dropout(h)
+
+        # Second linear layer (projection back to embed_dim)
+        out = torch.einsum('bih,ihe->bie', h, self.w2) + self.b2  # [batch*seq_len, num_experts, embed_dim]
+
+        # Weighted sum of expert outputs
+        out = (out * weights.unsqueeze(-1)).sum(dim=1)  # [batch*seq_len, embed_dim]
+
+        return out.view(*orig_shape)
+        # orig_shape = x.shape
+        # x = x.view(-1, self.embed_dim)  # [batch*seq_len, embed_dim]
+        #
+        # # Get routing weights and indices
+        # weights, indices = self.router(x)  # [B*T, top_k], [B*T, top_k]
+        #
+        # # Flatten indices and weights
+        # batch_size = x.shape[0]
+        # top_k = indices.shape[1]
+        # indices_flat = indices.view(-1)  # [B*T * top_k]
+        #
+        # # Compute contributions for selected experts without materializing large tensors
+        # # First Layer:
+        # # Compute all expert contributions first (but this may still be memory-heavy)
+        # # Alternative: Compute contributions for selected experts directly
+        # # ... (see detailed steps below)
+        #
+        # # Alternative approach using gather and batched operations
+        # x_expanded = x.unsqueeze(1).repeat(1, top_k, 1).view(-1, self.embed_dim)  # [B*T*top_k, D]
+        #
+        # # Compute first layer contributions using gather
+        # # indices_flat has shape [B*T*top_k]
+        # # selected_w1 is self.w1[indices_flat], but we compute the product inline
+        # h = torch.einsum(
+        #     'be, eih -> bh',
+        #     x_expanded,
+        #     self.w1[indices_flat]
+        # ) + self.b1[indices_flat]
+        # h = self._activate(h)
+        # h = self.dropout(h)
+        #
+        # # Second layer:
+        # out = torch.einsum(
+        #     'bh, eho -> beo',
+        #     h,
+        #     self.w2[indices_flat]
+        # ).squeeze(-1) + self.b2[indices_flat]
+        #
+        # # Reshape and apply weights
+        # out = out.view(batch_size, top_k, -1)
+        # weights = weights.view(batch_size, top_k, 1)
+        # out = (out * weights).sum(dim=1)
+        #
+        # return out.view(*orig_shape)
+
+
+class GatedMoeFeedForwardVectorized(MoeFeedForwardVectorized):
+    """Gated Mixture-of-Experts Feed-Forward layer - enable GLU-based activations for MoE"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        hidden_dim: int,
+        num_experts: int,
+        activation: nn.Module = nn.SiLU(),
+        top_k: int = 1,
+        dropout: float = 0.1,
+        *args,
+        **kwargs
+    ):
+        super(GatedMoeFeedForwardVectorized, self).__init__(
+            embed_dim=embed_dim,
+            hidden_dim=hidden_dim,
+            num_experts=num_experts,
+            activation=activation,
+            top_k=top_k,
+            dropout=dropout,
+            *args,
+            **kwargs
+        )
+
+    def _init_linear_parameters(self):
+        nn.init.kaiming_normal_(self.w1, nonlinearity='relu')
+        nn.init.kaiming_normal_(self.w2, nonlinearity='linear')
+
+    def _w1_dim_factor(self, hidden_dim: int) -> int:
+        return 2 * hidden_dim
+
+    def _activate(self, h: torch.Tensor):
+        a, b = h.chunk(2, dim=-1)
+        return a * self.activation(b)
rxnn/transformers/moe.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+from .ff import FeedForward, GatedFeedForward

 class MoeRouter(nn.Module):
     """Mixture-of-Experts Router layer - computes routing weights for each expert."""
@@ -14,18 +14,27 @@ class MoeRouter(nn.Module):
         # For expert load balancing
         self.register_buffer('aux_loss', torch.tensor(0.0), persistent=False)

+    def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
+        expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
+        expert_usage = expert_mask.sum(dim=0).mean(dim=0)
+        mean_probs = probs.mean(dim=0)
+        return (expert_usage * mean_probs).sum() * self.num_experts
+
+
     def forward(self, x: torch.Tensor):
-        #
+        # Input shape: [batch*seq_len, embed_dim]
         logits = self.gate(x)
         probs = F.softmax(logits, dim=-1)

-        #
-        mean_probs = probs.mean(dim=0)  # Mean probability per expert across batch
-        self.aux_loss = (mean_probs * torch.log(mean_probs + 1e-9)).sum()  # Entropy-based loss
-
+        # Get top-k experts for each token
         top_k_weights, top_k_indices = probs.topk(self.top_k, dim=-1)
+
+        # Normalize weights (sum to 1 for each token)
         top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)

+        # Load Balance Loss
+        self.aux_loss = self.calculate_aux_loss(top_k_indices, probs)
+
         return top_k_weights, top_k_indices

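Editor's note: the entropy-based auxiliary loss is replaced above by a load-balancing term, `sum_e(usage_e * mean_prob_e) * num_experts`. The toy computation below reproduces `calculate_aux_loss` step by step so the shapes are easy to follow; it is a standalone illustration, not library code.

```python
import torch
import torch.nn.functional as F

num_experts, top_k = 4, 2
probs = torch.tensor([[0.7, 0.1, 0.1, 0.1],      # router softmax for two tokens
                      [0.6, 0.2, 0.1, 0.1]])
top_k_indices = probs.topk(top_k, dim=-1).indices             # [tokens, top_k]

expert_mask = F.one_hot(top_k_indices, num_experts).float()   # [tokens, top_k, num_experts]
expert_usage = expert_mask.sum(dim=0).mean(dim=0)             # tokens per expert, averaged over slots
mean_probs = probs.mean(dim=0)                                # mean routing probability per expert
aux_loss = (expert_usage * mean_probs).sum() * num_experts
print(aux_loss)  # larger when routing mass concentrates on a few experts
```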
@@ -51,91 +60,43 @@ class MoeFeedForward(nn.Module):
         self.router = MoeRouter(embed_dim, num_experts, top_k)

         # Batch all expert parameters together
-        self.
-        self.b1 = nn.Parameter(torch.zeros(num_experts, self._w1_dim_factor(hidden_dim)))
-        self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, embed_dim))
-        self.b2 = nn.Parameter(torch.zeros(num_experts, embed_dim))
-        self.activation = activation
-        self.dropout = nn.Dropout(dropout)
-
-        # Initialize parameters
-        self._init_linear_parameters()
-        nn.init.zeros_(self.b1)
-        nn.init.zeros_(self.b2)
+        self._init_experts(num_experts, embed_dim, hidden_dim, activation, dropout)

-    def
-
-
-
-
-        return hidden_dim
-
-    def _activate(self, h: torch.Tensor):
-        return self.activation(h)
+    def _init_experts(self, num_experts: int, embed_dim: int, hidden_dim: int, activation: nn.Module, dropout: float):
+        self.experts = nn.ModuleList([
+            FeedForward(embed_dim, hidden_dim, activation, dropout)
+            for _ in range(num_experts)
+        ])

     def router_loss(self):
         return self.router.aux_loss

     def forward(self, x: torch.Tensor):
-        # orig_shape = x.shape
-        # x = x.view(-1, self.embed_dim)  # [batch*seq_len, embed_dim]
-        #
-        # # Get routing weights and indices
-        # weights, indices = self.router(x)  # [batch*seq_len, top_k]
-        #
-        # # Create expert masks and combine it with masks
-        # mask = F.one_hot(indices, self.num_experts).float()  # [batch*seq_len, top_k, num_experts]
-        # weights = (weights.unsqueeze(-1) * mask).sum(dim=1)  # [batch*seq_len, num_experts]
-        #
-        # # Expert computation
-        # x = x.unsqueeze(1).expand(-1, self.num_experts, -1)  # [batch*seq_len, num_experts, embed_dim]
-        #
-        # # First linear layer
-        # h = torch.einsum('bie,ieh->bih', x, self.w1) + self.b1  # [batch*seq_len, num_experts, hidden_dim]
-        # h = self._activate(h)
-        # h = self.dropout(h)
-        #
-        # # Second linear layer (projection back to embed_dim)
-        # out = torch.einsum('bih,ihe->bie', h, self.w2) + self.b2  # [batch*seq_len, num_experts, embed_dim]
-        #
-        # # Weighted sum of expert outputs
-        # out = (out * weights.unsqueeze(-1)).sum(dim=1)  # [batch*seq_len, embed_dim]
-        #
-        # return out.view(*orig_shape)
         orig_shape = x.shape
         x = x.view(-1, self.embed_dim)  # [batch*seq_len, embed_dim]

         # Get routing weights and indices
-        weights, indices = self.router(x)  # [
-
-        # Flatten indices and weights
-        batch_size = x.size(0)
-        top_k = indices.size(1)
-        indices = indices.view(-1)  # [batch*seq_len * top_k]
-        weights = weights.view(-1, 1)  # [batch*seq_len * top_k, 1]
+        weights, indices = self.router(x)  # [B*T, top_k], [B*T, top_k]

-        #
-
-
-        selected_w2 = self.w2[indices]  # [batch*seq_len * top_k, hidden_dim, embed_dim]
-        selected_b2 = self.b2[indices]  # [batch*seq_len * top_k, embed_dim]
+        # Create mask for expert contributions (B*T, num_experts)
+        expert_mask = F.one_hot(indices, self.num_experts).float()  # [B*T, top_k, num_experts]
+        expert_weights = (weights.unsqueeze(-1) * expert_mask).sum(dim=1)  # [B*T, num_experts]

-
-
+        output = torch.zeros_like(x)
+        for expert_idx in range(self.num_experts):
+            # Mask for tokens where this expert is in top_k
+            mask = expert_weights[:, expert_idx] > 0
+            if not mask.any():
+                continue

-
-
-
-        h = self.dropout(h)
+            # Compute expert output for selected tokens
+            expert_input = x[mask]
+            expert_output = self.experts[expert_idx](expert_input)

-
+            # Apply combined weights for this expert
+            output[mask] += expert_output * expert_weights[mask, expert_idx].unsqueeze(-1)

-
-        out = out.view(batch_size, top_k, -1)  # [batch*seq_len, top_k, embed_dim]
-        weights = weights.view(batch_size, top_k, 1)  # [batch*seq_len, top_k, 1]
-        out = (out * weights).sum(dim=1)  # Weighted sum over top_k experts
-
-        return out.view(*orig_shape)
+        return output.view(*orig_shape)


 class GatedMoeFeedForward(MoeFeedForward):
@@ -163,13 +124,8 @@ class GatedMoeFeedForward(MoeFeedForward):
             **kwargs
         )

-    def
-
-
-
-
-        return 2 * hidden_dim
-
-    def _activate(self, h: torch.Tensor):
-        a, b = h.chunk(2, dim=-1)
-        return a * self.activation(b)
+    def _init_experts(self, num_experts: int, embed_dim: int, hidden_dim: int, activation: nn.Module, dropout: float):
+        self.experts = nn.ModuleList([
+            GatedFeedForward(embed_dim, hidden_dim, activation, dropout)
+            for _ in range(num_experts)
+        ])
{rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/RECORD
CHANGED
@@ -1,6 +1,8 @@
 rxnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/experimental/attention.py,sha256=
+rxnn/experimental/attention.py,sha256=qly-Lf9UsYC9JB945JcLnt27ZbF0vFvfyS5iUm-Rsak,31644
+rxnn/experimental/models.py,sha256=ioYtbJDxJ4zASiKs9dFY4WvAJn7eVqFf7zid-65pbUU,4709
+rxnn/experimental/moe.py,sha256=PhiaNr3FwR2Zv2a0tfj6sfZ4iyhLo3Jyp2DwXq19qZQ,7935
 rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/memory/norm.py,sha256=Ofl8Q5NYEF9GQeO0bhM43tkTW91J0y6TSvTAOYMgloM,6278
 rxnn/memory/stm.py,sha256=EsD8slSP4_9dLuq6aFPDmuFe8PWilxh90so5Z3nm-ig,2057
@@ -19,11 +21,11 @@ rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=HhIiykmrBgdsV4AbMQXr9t0cSo4gSIeN0dPtc8mDyOo,5629
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
 rxnn/transformers/models.py,sha256=w-zB_8QB9-Fae-GkGgmVDNY-Ts_0gBeWcevpl9qzZVM,7169
-rxnn/transformers/moe.py,sha256=
+rxnn/transformers/moe.py,sha256=FeaQR7hTX1dE74YdMOcuyZHSkGiV_0JwF8fw-GnfNOQ,4741
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.16.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.16.dist-info/METADATA,sha256=Cr_8OPHWlf2LHYlZEmc_NaUkIiE3ShJ01Z5B5ZhI6G8,14629
+rxnn-0.1.16.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.16.dist-info/RECORD,,
{rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/LICENSE
File without changes
{rxnn-0.1.14.dist-info → rxnn-0.1.16.dist-info}/WHEEL
File without changes