rxnn 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/experimental/attention.py +151 -156
- rxnn/experimental/moe.py +0 -40
- rxnn/transformers/attention.py +45 -37
- {rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/METADATA +1 -1
- {rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/RECORD +7 -7
- {rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/LICENSE +0 -0
- {rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/WHEEL +0 -0
rxnn/experimental/attention.py
CHANGED
@@ -9,6 +9,10 @@ from ..transformers.moe import MoeRouter
 
 class GroupedMoeAttention(GroupedQueryAttention):
     """
+    Vectorized implementation calculates all expert heads for each token and selecting active tokens later. Linear layers
+    for Attention are rather small, compared to MoE Feed Forward layers, so it's possible that it will be faster than filtering
+    experts - it has to be tested.
+
     Grouped MoE Attention (GMA) - GQA extended with Mixture-of-Experts (MoE) routing.
 
     Instead of mapping keys/values to static head groups, it dynamically selects head expert groups. It has the same
@@ -44,7 +48,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
             *args,
             **kwargs,
     ):
-        self.num_experts = num_experts
+        self.num_experts = num_experts if num_experts is not None else num_heads
         super(GroupedMoeAttention, self).__init__(
             embed_dim,
             num_heads,
@@ -61,78 +65,63 @@ class GroupedMoeAttention(GroupedQueryAttention):
             **kwargs,
         )
 
-    def router_loss(self):
-        return self.router.aux_loss
-
     def _init_kv(self, embed_dim: int):
         self.router = MoeRouter(embed_dim, self.num_experts, top_k=self.num_groups)
-
         hidden_dim = embed_dim // self.num_heads
-        self.wk = nn.Parameter(torch.empty(self.num_experts,
+        self.wk = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
         self.bk = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
-        self.wv = nn.Parameter(torch.empty(self.num_experts,
+        self.wv = nn.Parameter(torch.empty(self.num_experts, embed_dim, hidden_dim))
         self.bv = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
         self._init_experts()
 
     def _init_experts(self):
-        nn.init.xavier_uniform_(self.wk)
-        nn.init.xavier_uniform_(self.wv)
+        torch.nn.init.xavier_uniform_(self.wk)
+        torch.nn.init.xavier_uniform_(self.wv)
         if self.use_bias:
-            nn.init.zeros_(self.bk)
-            nn.init.zeros_(self.bv)
-
-    def _process_grouped_experts(self, x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, weights: torch.Tensor, indices: torch.Tensor):
-        B, S, G = indices.shape
-        x_flat = x.view(-1, x.size(-1)) # [B*S, D]
-
-        indices_flat = indices.view(-1, G) # [B*S, G]
-        weights_flat = weights.view(-1, G) # [B*S, G]
+            torch.nn.init.zeros_(self.bk)
+            torch.nn.init.zeros_(self.bv)
 
-
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int,
+                     skip_query_processing: bool = False):
+        head_dim = d // self.num_heads
 
-
-
-            expert_mask = (indices_flat == e).any(dim=1) # [B*S]
-            if not expert_mask.any():
-                continue
+        # Process Query as in GQA
+        q = self.q_proj(query).view(b, t, self.num_heads, -1).transpose(1, 2) if not skip_query_processing else query
 
-
-
-
+        # Key/Value MoE routing
+        B, S, D = key.shape
+        key_flat = key.reshape(-1, D)
+        weights, indices = self.router(key_flat) # (B*S, num_groups), (B*S, num_groups)
+        weights = weights.view(B, S, self.num_groups, 1)
+        indices = indices.view(B, S, self.num_groups)
 
-
-
-
-
-                continue
+        # Compute all experts' projections
+        # Shape: (B*S, num_experts, head_dim)
+        k_all = torch.einsum('bd,edh->beh', key_flat, self.wk) # [B*S, num_experts, head_dim]
+        v_all = torch.einsum('bd,edh->beh', value.view(-1, D), self.wv)
 
-
-
-
-                weighted_proj = proj[group_mask] * weights_flat[group_tokens, g].unsqueeze(-1)
-                output[group_tokens, g] += weighted_proj
+        if self.use_bias:
+            k_all += self.bk
+            v_all += self.bv
 
-
+        # Get results for all heads
+        k_all = k_all.view(B, S, self.num_experts, -1) # [B, S, num_experts, head_dim]
+        v_all = v_all.view(B, S, self.num_experts, -1) # [B, S, num_experts, head_dim]
 
-
-
-
+        # Gather top-k experts using expanded indices
+        expanded_indices = indices.unsqueeze(-1).expand(-1, -1, -1, k_all.size(-1)) # [B, S, num_groups, head_dim]
+        selected_k = torch.gather(k_all, 2, expanded_indices) # [B, S, num_groups, head_dim]
+        selected_v = torch.gather(v_all, 2, expanded_indices) # [B, S, num_groups, head_dim]
 
-        #
-        B, S,
-
-        weights_k_flat, indices_k_flat = self.router(key_flat)
-        # Reshape back to original dimensions
-        weights_k = weights_k_flat.view(B, S, -1)
-        indices_k = indices_k_flat.view(B, S, -1)
-        k = self._process_grouped_experts(key, self.wk, self.bk, weights_k, indices_k)
-        v = self._process_grouped_experts(value, self.wv, self.bv, weights_k, indices_k)
+        # Weighted
+        weighted_k = selected_k * weights # [B, S, num_groups, head_dim]
+        weighted_v = selected_v * weights # [B, S, num_groups, head_dim]
 
-        #
-        k =
-        v =
+        # Reshape to GQA format
+        k = weighted_k.view(B, S, self.num_groups, -1).permute(0, 2, 1, 3) # [B, num_groups, S, head_dim]
+        v = weighted_v.view(B, S, self.num_groups, -1).permute(0, 2, 1, 3) # [B, num_groups, S, head_dim]
 
-        if not self.
+        if not self.rel_embed:
             group_heads = self.num_heads // self.num_groups
 
             k = k.unsqueeze(2).expand(-1, -1, group_heads, -1, -1) # (B, G, group_heads, S, head_dim)
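The new `_forward_qkv` above computes every expert's key/value projection in one einsum and then gathers only the experts picked by the router. Below is a minimal, self-contained sketch of that compute-all-then-gather pattern, using random tensors in place of the router and the projection weights (illustrative names only, not the rxnn code itself):

```python
import torch

B, S, D = 2, 5, 16                 # batch, sequence length, embed dim
num_experts, num_groups = 4, 2     # experts and routed groups (top-k) per token
head_dim = 8

key = torch.randn(B, S, D)
wk = torch.randn(num_experts, D, head_dim)   # one K projection per expert
key_flat = key.reshape(-1, D)                # [B*S, D]

# Stand-in for MoeRouter output: weights and expert indices per token
weights = torch.rand(B, S, num_groups, 1)
indices = torch.randint(0, num_experts, (B, S, num_groups))

# 1) Project every token with every expert in a single einsum
k_all = torch.einsum('bd,edh->beh', key_flat, wk).view(B, S, num_experts, head_dim)

# 2) Keep only the routed experts for each token
expanded = indices.unsqueeze(-1).expand(-1, -1, -1, head_dim)
selected_k = torch.gather(k_all, 2, expanded)        # [B, S, num_groups, head_dim]

# 3) Apply router weights and move to the GQA layout used downstream
k = (selected_k * weights).permute(0, 2, 1, 3)       # [B, num_groups, S, head_dim]
print(k.shape)  # torch.Size([2, 2, 5, 8])
```

As the new class docstring notes, this computes more projections than strictly needed, but the attention projections are small, so it may still beat the token-filtering variant; the package itself flags this as something to benchmark.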
@@ -197,55 +186,54 @@ class DeepMoeAttention(GroupedMoeAttention):
             **kwargs,
         )
 
-    def router_loss(self):
-        return (self.router.aux_loss + self.query_router.aux_loss) / 2
-
     def _init_q(self, embed_dim: int):
         self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.num_query_groups)
-
         hidden_dim = embed_dim // self.num_heads
-        self.wq = nn.Parameter(torch.empty(self.num_query_experts,
+        self.wq = nn.Parameter(torch.empty(self.num_query_experts, embed_dim, hidden_dim))
         self.bq = nn.Parameter(torch.zeros(self.num_query_experts, hidden_dim)) if self.use_bias else None
         self._init_query_experts()
 
     def _init_query_experts(self):
-        nn.init.xavier_uniform_(self.wq)
+        torch.nn.init.xavier_uniform_(self.wq)
         if self.use_bias:
-            nn.init.zeros_(self.bq)
+            torch.nn.init.zeros_(self.bq)
 
     def _init_out(self, embed_dim: int):
         """Initialize output projection"""
-
-        self.out_proj = nn.Linear(
+        out_hidden_dim = embed_dim // self.num_heads * self.num_query_groups
+        self.out_proj = nn.Linear(out_hidden_dim, embed_dim)
 
     def _transpose_output(self, attn_output: torch.Tensor, b: int, t: int, d: int):
         """Transpose attention output back to (B, T, D) shape"""
-
-        return attn_output.transpose(1, 2).contiguous().view(b, t,
+        out_hidden_dim = d // self.num_heads * self.num_query_groups
+        return attn_output.transpose(1, 2).contiguous().view(b, t, out_hidden_dim)
 
-    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int
-        # Query processing
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int):
         B, T, D = query.shape
-
-
-
-
-        weights_q = weights_q_flat.view(B, T, -1)
-        indices_q = indices_q_flat.view(B, T, -1)
-        q = self._process_grouped_experts(query, self.wq, self.bq, weights_q, indices_q)
+        query_flat = query.reshape(-1, D)
+        weights_q, indices_q = self.query_router(query_flat)
+        weights_q = weights_q.view(B, T, self.num_query_groups, 1)
+        indices_q = indices_q.view(B, T, self.num_query_groups)
 
-
-        #
-
+        # Compute all query experts
+        q_all = torch.einsum('bd,edh->beh', query_flat, self.wq) # [B*T, num_query_experts, head_dim]
+        if self.use_bias:
+            q_all += self.bq
 
-
+        q_all = q_all.view(B, T, self.num_query_experts, -1)
 
-
-
-
-
-
+        # Gather top-k experts
+        expanded_indices = indices_q.unsqueeze(-1).expand(-1, -1, -1, q_all.size(-1))
+        selected_q = torch.gather(q_all, 2, expanded_indices) # [B, T, num_query_groups, head_dim]
+
+        # Weighted sum
+        q = selected_q * weights_q # [B, T, num_query_groups, head_dim]
+        q = q.view(B, T, self.num_query_groups, -1).permute(0, 2, 1, 3) # [B, num_query_groups, T, head_dim]
+
+        return super()._forward_qkv(q, key, value, b, t, d, skip_query_processing=True)
 
+class GroupedMoeAttentionSimplified(GroupedQueryAttention):
+    """
     Grouped MoE Attention (GMA) - GQA extended with Mixture-of-Experts (MoE) routing.
 
     Instead of mapping keys/values to static head groups, it dynamically selects head expert groups. It has the same
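The reworked `_init_out` and `_transpose_output` size the output projection from the number of routed query groups instead of the full head count. A quick worked example of that arithmetic, with assumed dimensions (not values taken from the package):

```python
# Hypothetical model dimensions, for illustration only.
embed_dim = 512
num_heads = 16
num_query_groups = 4

head_dim = embed_dim // num_heads                  # 32
out_hidden_dim = head_dim * num_query_groups       # 128 = embed_dim // num_heads * num_query_groups

# The attention output concatenates only the routed query groups,
# so out_proj maps 128 -> 512 rather than 512 -> 512:
# nn.Linear(out_hidden_dim, embed_dim)
print(head_dim, out_hidden_dim)                    # 32 128
```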
@@ -281,8 +269,8 @@ class GroupedMoeAttentionVectorized(GroupedQueryAttention):
             *args,
             **kwargs,
     ):
-        self.num_experts = num_experts
-        super(
+        self.num_experts = num_experts or num_heads
+        super(GroupedMoeAttentionSimplified, self).__init__(
             embed_dim,
             num_heads,
             num_groups=num_groups,
@@ -298,63 +286,78 @@ class GroupedMoeAttentionVectorized(GroupedQueryAttention):
             **kwargs,
         )
 
+    def router_loss(self):
+        return self.router.aux_loss
+
     def _init_kv(self, embed_dim: int):
         self.router = MoeRouter(embed_dim, self.num_experts, top_k=self.num_groups)
+
         hidden_dim = embed_dim // self.num_heads
-        self.wk = nn.Parameter(torch.empty(self.num_experts,
+        self.wk = nn.Parameter(torch.empty(self.num_experts, hidden_dim, embed_dim))
         self.bk = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
-        self.wv = nn.Parameter(torch.empty(self.num_experts,
+        self.wv = nn.Parameter(torch.empty(self.num_experts, hidden_dim, embed_dim))
         self.bv = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
         self._init_experts()
 
     def _init_experts(self):
-
-
+        nn.init.xavier_uniform_(self.wk)
+        nn.init.xavier_uniform_(self.wv)
         if self.use_bias:
-
-
+            nn.init.zeros_(self.bk)
+            nn.init.zeros_(self.bv)
 
-    def
-
-
+    def _process_grouped_experts(self, x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, weights: torch.Tensor, indices: torch.Tensor):
+        B, S, G = indices.shape
+        x_flat = x.view(-1, x.size(-1)) # [B*S, D]
 
-
-
+        indices_flat = indices.view(-1, G) # [B*S, G]
+        weights_flat = weights.view(-1, G) # [B*S, G]
 
-        #
-        B, S, D = key.shape
-        key_flat = key.reshape(-1, D)
-        weights, indices = self.router(key_flat) # (B*S, num_groups), (B*S, num_groups)
-        weights = weights.view(B, S, self.num_groups, 1)
-        indices = indices.view(B, S, self.num_groups)
+        output = torch.zeros(B * S, G, w.size(1), device=x.device, dtype=x.dtype) # [B*S, G, hidden_dim]
 
-
-
-
-
+        for e in range(self.num_experts):
+            # 1. Find tokens where expert `e` is used in ANY group
+            expert_mask = (indices_flat == e).any(dim=1) # [B*S]
+            if not expert_mask.any():
+                continue
 
-
-
-
+            # 2. Project tokens using expert `e`
+            x_slice = x_flat[expert_mask] # [num_selected, D]
+            proj = F.linear(x_slice, w[e], b[e] if b is not None else None) # [num_selected, hidden_dim]
 
-
-
-
+            # 3. Scatter projections into correct groups
+            for g in range(G):
+                group_mask = indices_flat[expert_mask, g] == e # [num_selected]
+                if not group_mask.any():
+                    continue
 
-
-
-
-
+                # Get tokens in this group using expert `e`
+                group_tokens = expert_mask.nonzero()[group_mask].squeeze(1)
+                # Weight and scatter
+                weighted_proj = proj[group_mask] * weights_flat[group_tokens, g].unsqueeze(-1)
+                output[group_tokens, g] += weighted_proj
 
-
-        weighted_k = selected_k * weights # [B, S, num_groups, head_dim]
-        weighted_v = selected_v * weights # [B, S, num_groups, head_dim]
+        return output.view(B, S, G, -1)
 
-
-
-
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int,
+                     skip_query_processing: bool = False):
+        q = self.q_proj(query).view(b, t, self.num_heads, -1).transpose(1, 2) if not skip_query_processing else query
+
+        # Key/Value processing
+        B, S, D = key.shape
+        key_flat = key.view(-1, D)
+        weights_k_flat, indices_k_flat = self.router(key_flat)
+        # Reshape back to original dimensions
+        weights_k = weights_k_flat.view(B, S, -1)
+        indices_k = indices_k_flat.view(B, S, -1)
+        k = self._process_grouped_experts(key, self.wk, self.bk, weights_k, indices_k)
+        v = self._process_grouped_experts(value, self.wv, self.bv, weights_k, indices_k)
+
+        # Expand to GQA format
+        k = k.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
+        v = v.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
 
-        if not self.
+        if not self.rel_embed:
             group_heads = self.num_heads // self.num_groups
 
             k = k.unsqueeze(2).expand(-1, -1, group_heads, -1, -1) # (B, G, group_heads, S, head_dim)
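`GroupedMoeAttentionSimplified` takes the opposite approach to the vectorized class: it loops over experts and projects only the tokens actually routed to each one, then scatters the results back into their groups. A hedged, standalone sketch of that filter-project-scatter loop (random routing stands in for `MoeRouter`; tensor names are illustrative):

```python
import torch
import torch.nn.functional as F

BS, D, H = 10, 16, 8          # flattened tokens (B*S), embed dim, per-head dim
num_experts, G = 4, 2         # experts, routed groups (top-k) per token

x_flat = torch.randn(BS, D)
w = torch.randn(num_experts, H, D)        # per-expert weights in F.linear layout
weights_flat = torch.rand(BS, G)
indices_flat = torch.randint(0, num_experts, (BS, G))

output = torch.zeros(BS, G, H)
for e in range(num_experts):
    token_mask = (indices_flat == e).any(dim=1)    # tokens using expert e in any group
    if not token_mask.any():
        continue
    proj = F.linear(x_flat[token_mask], w[e])      # project only the selected tokens
    for g in range(G):
        group_mask = indices_flat[token_mask, g] == e
        if not group_mask.any():
            continue
        rows = token_mask.nonzero(as_tuple=True)[0][group_mask]
        output[rows, g] += proj[group_mask] * weights_flat[rows, g].unsqueeze(-1)

print(output.shape)  # torch.Size([10, 2, 8])
```

The trade-off against the einsum version is smaller matmuls at the cost of a Python loop over experts and groups, which is exactly the comparison the docstrings say still has to be tested.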
@@ -366,12 +369,8 @@ class GroupedMoeAttentionVectorized(GroupedQueryAttention):
         return q, k, v
 
 
-class
+class DeepMoeAttentionSimplified(GroupedMoeAttentionSimplified):
     """
-    Vectorized implementation calculates all expert heads for each token and selecting active tokens later. Linear layers
-    for Attention are rather small, compared to MoE Feed Forward layers, so it's possible that it will be faster than filtering
-    experts - it has to be tested.
-
     Deep MoE Attention (SMA) - Grouped MoE Attention extended even more for sublinear computational efficiency.
 
     In addition to using Mixture-of-Experts (MoE) for key/value head groups, SMA is also using dynamically selected
@@ -406,7 +405,7 @@ class DeepMoeAttentionVectorized(GroupedMoeAttentionVectorized):
     ):
         self.num_query_experts = num_query_experts if num_query_experts is not None else num_heads
         self.num_query_groups = num_query_groups if num_query_groups is not None else num_groups
-        super(
+        super(DeepMoeAttentionSimplified, self).__init__(
             embed_dim,
             num_heads,
             num_groups=num_groups,
@@ -423,52 +422,48 @@ class DeepMoeAttentionVectorized(GroupedMoeAttentionVectorized):
             **kwargs,
         )
 
+    def router_loss(self):
+        return (self.router.aux_loss + self.query_router.aux_loss) / 2
+
     def _init_q(self, embed_dim: int):
         self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.num_query_groups)
+
         hidden_dim = embed_dim // self.num_heads
-        self.wq = nn.Parameter(torch.empty(self.num_query_experts,
+        self.wq = nn.Parameter(torch.empty(self.num_query_experts, hidden_dim, embed_dim))
         self.bq = nn.Parameter(torch.zeros(self.num_query_experts, hidden_dim)) if self.use_bias else None
         self._init_query_experts()
 
     def _init_query_experts(self):
-
+        nn.init.xavier_uniform_(self.wq)
         if self.use_bias:
-
+            nn.init.zeros_(self.bq)
 
     def _init_out(self, embed_dim: int):
         """Initialize output projection"""
-
-        self.out_proj = nn.Linear(
+        hidden_dim = embed_dim // (self.num_heads // self.num_query_groups)
+        self.out_proj = nn.Linear(hidden_dim, embed_dim)
 
     def _transpose_output(self, attn_output: torch.Tensor, b: int, t: int, d: int):
         """Transpose attention output back to (B, T, D) shape"""
-
-        return attn_output.transpose(1, 2).contiguous().view(b, t,
+        hidden_dim = d // self.num_heads * self.num_query_groups
+        return attn_output.transpose(1, 2).contiguous().view(b, t, hidden_dim)
 
-    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int):
+    def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int, skip_query_processing: bool = False):
+        # Query processing
         B, T, D = query.shape
-
-
-
-
-
-
-
-        if self.use_bias:
-            q_all += self.bq
-
-        q_all = q_all.view(B, T, self.num_query_experts, -1)
-
-        # Gather top-k experts
-        expanded_indices = indices_q.unsqueeze(-1).expand(-1, -1, -1, q_all.size(-1))
-        selected_q = torch.gather(q_all, 2, expanded_indices) # [B, T, num_query_groups, head_dim]
-
-        # Weighted sum
-        q = selected_q * weights_q # [B, T, num_query_groups, head_dim]
-        q = q.view(B, T, self.num_query_groups, -1).permute(0, 2, 1, 3) # [B, num_query_groups, T, head_dim]
+        # Flatten for query routing
+        query_flat = query.view(-1, D)
+        weights_q_flat, indices_q_flat = self.query_router(query_flat)
+        # Reshape back
+        weights_q = weights_q_flat.view(B, T, -1)
+        indices_q = indices_q_flat.view(B, T, -1)
+        q = self._process_grouped_experts(query, self.wq, self.bq, weights_q, indices_q)
 
+        q = q.permute(0, 2, 1, 3).reshape(B, self.num_query_groups, T, -1)
+        # Key/Value processing
         return super()._forward_qkv(q, key, value, b, t, d, skip_query_processing=True)
 
+
 # Others
 
 class FlexAttention(MultiHeadAttention):
rxnn/experimental/moe.py
CHANGED
@@ -3,46 +3,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 from ..transformers.moe import MoeRouter
 
-class DynamicMoeRouter(nn.Module):
-    """Dynamic Mixture-of-Experts Router layer - dynamically selects top-k experts for each token."""
-
-    def __init__(self, embed_dim: int, num_experts: int, top_ks: tuple[int] = (1, 2, 3), *args, **kwargs):
-        super(DynamicMoeRouter, self).__init__(*args, **kwargs)
-        self.top_ks = top_ks
-        self.num_options = len(top_ks)
-        self.num_experts = num_experts
-        self.gate = nn.Linear(embed_dim, num_experts + self.num_options, bias=False)
-        # For expert load balancing
-        self.register_buffer('aux_loss', torch.tensor(0.0), persistent=False)
-
-    def calculate_aux_loss(self, top_k_indices: torch.Tensor, routing_probs: torch.Tensor) -> torch.Tensor:
-        expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
-        expert_usage = expert_mask.sum(dim=0).mean(dim=0)
-        mean_probs = routing_probs.mean(dim=0)
-        return (expert_usage * mean_probs).sum() * self.num_experts
-
-    def forward(self, x: torch.Tensor):
-        # Input shape: [batch*seq_len, embed_dim]
-        all_logits = self.gate(x)
-        routing_logits = all_logits[:, :-self.num_options]
-        options_logits = all_logits[:, -self.num_options:]
-
-        routing_probs = F.softmax(routing_logits, dim=-1)
-        top_k_id = torch.argmax(options_logits, dim=-1).item()
-
-        top_k = self.top_ks[top_k_id]
-
-        # Get top-k experts for each token
-        top_k_weights, top_k_indices = routing_probs.topk(top_k, dim=-1)
-
-        # Normalize weights (sum to 1 for each token)
-        top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
-
-        # Load Balance Loss
-        self.aux_loss = self.calculate_aux_loss(top_k_indices, routing_probs)
-
-        return top_k_weights, top_k_indices, top_k
-
 class MoeFeedForwardVectorized(nn.Module):
     """
     Vectorized MoE - current implementation is incorrect - it calculates all the experts, then selects the correct ones.
rxnn/transformers/attention.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.
+from torch.nn.attention import sdpa_kernel, SDPBackend
 import math
 from .positional import RotaryPositionalEmbedding, RelativePositionalEmbedding
 
@@ -102,36 +102,41 @@ class MultiHeadAttention(nn.Module):
 
     def _flash_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int,
                          mask: torch.Tensor = None, enable_gqa: bool = False):
-        with
-
-
-
-
-
-
-
+        with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]):
+            return self._torch_attention(q, k, v, b, t, d, mask=mask, enable_gqa=enable_gqa)
+
+    def _torch_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int,
+                         mask: torch.Tensor = None, enable_gqa: bool = False):
+        attn_output = F.scaled_dot_product_attention(
+            q, k, v,
+            attn_mask=mask if not self.is_causal else None,
+            dropout_p=self.dropout.p if self.training else 0.0,
+            is_causal=self.is_causal,
+            enable_gqa=enable_gqa,
+        )
         return self._transpose_output(attn_output, b, t, d)
 
-    def
-
-
-
+    def _calculate_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int, mask: torch.Tensor = None):
+        if self.use_flash_attention:
+            # Compute attention with FlashAttention
+            return self._flash_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask)
+        else:
+            # Compute attention using optimized PyTorch implementation
+            return self._torch_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask)
+
+    def _calculate_attention_with_relative_embedding(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int, mask: torch.Tensor = None):
+        attn_weights = self._calculate_attn_weight_with_relative_embeddings(q, k, mask=mask)
+        attn_weights = self.dropout(attn_weights)
+        return self._calculate_output(attn_weights, v, b, t, d)
 
     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor = None):
         b, t, d = query.size()
         q, k, v = self._forward_qkv(query, key, value, b, t, d)
-        if self.
+        if not self.rel_embed:
             q, k = self._apply_rope(q, k)
-            attn_output = self.
+            attn_output = self._calculate_attention(q, k, v, b, t, d, mask=mask)
         else:
-
-            attn_weights = self._calculate_attn_weights(q, k, d, mask=mask)
-        else:
-            attn_weights = self._calculate_attn_weight_with_relative_embeddings(q, k, mask=mask)
-
-        attn_weights = self.dropout(attn_weights)
-
-        attn_output = self._calculate_output(attn_weights, v, b, t, d)
+            attn_output = self._calculate_attention_with_relative_embedding(q, k, v, b, t, d, mask=mask)
         return self.out_proj(attn_output)
 
 
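Both `_flash_attention` and `_torch_attention` now reduce to `F.scaled_dot_product_attention`; the flash path just pins the SDPA backend selection. A small sketch of that pattern, assuming PyTorch 2.3+ for `torch.nn.attention.sdpa_kernel` (shapes and the causal flag are illustrative):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

B, H, S, Dh = 2, 4, 32, 16
q = torch.randn(B, H, S, Dh)
k = torch.randn(B, H, S, Dh)
v = torch.randn(B, H, S, Dh)

# Default: PyTorch picks whichever SDPA backend it considers best.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Restrict kernel selection to FlashAttention, mirroring the _flash_attention wrapper.
# Raises at call time if no listed backend supports the inputs/device.
try:
    with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
except RuntimeError:
    pass  # FlashAttention backend unavailable for these inputs

print(out.shape)  # torch.Size([2, 4, 32, 16])
```

The GQA and MQA subclasses below reuse the same helpers and pass `enable_gqa` so SDPA can broadcast the smaller number of key/value heads across the query heads.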
@@ -178,7 +183,7 @@ class GroupedQueryAttention(MultiHeadAttention):
     def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int):
         """Override query, key, and value projections for GQA case - split data into heads and groups"""
         head_dim = d // self.num_heads
-        if self.
+        if not self.rel_embed:
             q = self.q_proj(query).view(b, t, self.num_heads, head_dim).transpose(1, 2)
             k = self.k_proj(key).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
             v = self.v_proj(value).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
@@ -202,12 +207,14 @@ class GroupedQueryAttention(MultiHeadAttention):
         v = v.flatten(start_dim=1, end_dim=2) # (B, H, S, head_dim)
         return q, k, v
 
-    def
-
-
-
-
-
+    def _calculate_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int, mask: torch.Tensor = None):
+        is_gqa = self.num_heads != self.num_groups
+        if self.use_flash_attention:
+            # Compute attention with FlashAttention
+            return self._flash_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask, enable_gqa=is_gqa)
+        else:
+            # Compute attention using optimized PyTorch implementation
+            return self._torch_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask, enable_gqa=is_gqa)
 
 
 class MultiQueryAttention(MultiHeadAttention):
@@ -251,7 +258,7 @@ class MultiQueryAttention(MultiHeadAttention):
     def _forward_qkv(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, b: int, t: int, d: int):
         """Override query, key, and value projections for GQA case - use multiple heads
         for query and single for key/values"""
-        if self.
+        if not self.rel_embed:
             q = self.q_proj(query).view(b, t, self.num_heads, d // self.num_heads).transpose(1, 2)
             k = self.k_proj(key).view(b, -1, 1, d // self.num_heads).transpose(1, 2)
             v = self.v_proj(value).view(b, -1, 1, d // self.num_heads).transpose(1, 2)
@@ -261,12 +268,13 @@ class MultiQueryAttention(MultiHeadAttention):
             v = self.v_proj(value).unsqueeze(1).expand(-1, self.num_heads, -1, -1)
         return q, k, v
 
-    def
-
-
-            q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask,
-
-
+    def _calculate_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, b: int, t: int, d: int, mask: torch.Tensor = None):
+        if self.use_flash_attention:
+            # Compute attention with FlashAttention
+            return self._flash_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask, enable_gqa=True)
+        else:
+            # Compute attention using optimized PyTorch implementation
+            return self._torch_attention(q.contiguous(), k.contiguous(), v.contiguous(), b, t, d, mask=mask, enable_gqa=True)
 
 
 def init_attention(
{rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 rxnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/experimental/attention.py,sha256=
+rxnn/experimental/attention.py,sha256=GxbLmOTBvUiYU0Rc_0ju1n_ocJciHC6i3neDGe-rZZc,29426
 rxnn/experimental/models.py,sha256=QEuFBB9iEg5AbKQLwGJkAwPjMfaVeTqazhKDWPRkm7o,4598
-rxnn/experimental/moe.py,sha256=
+rxnn/experimental/moe.py,sha256=jHZ1QhpWiVQOswVpFmuH7b2IUOPf0Uuf-I2Ddwsd7Us,6140
 rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/memory/norm.py,sha256=Ofl8Q5NYEF9GQeO0bhM43tkTW91J0y6TSvTAOYMgloM,6278
 rxnn/memory/stm.py,sha256=EsD8slSP4_9dLuq6aFPDmuFe8PWilxh90so5Z3nm-ig,2057
@@ -16,7 +16,7 @@ rxnn/training/dataset.py,sha256=vQ5mDF3bA0HXya474n4D4iL8Mn3AEpJukgzFNVkxjGU,5106
 rxnn/training/scheduler.py,sha256=ow6oALzWjWQmHSpcJEjv6tg4g4CDMvr73TypxfcefMc,712
 rxnn/training/tokenizer.py,sha256=4Y41f07uo2KPA_7bp3FCcwGKbXoS2hsckOoXUsXfQxY,8052
 rxnn/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/transformers/attention.py,sha256=
+rxnn/transformers/attention.py,sha256=zv0uH3_L39tVmpiwNdmEf6Cp602uqdbr3UQj8Z3hIIk,15349
 rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=n_jZTqEF_vLkF31AkB5XGErfm2sQFd9CRqJUHKRFkKI,6956
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
@@ -25,7 +25,7 @@ rxnn/transformers/moe.py,sha256=6Cffyo0QjmEWc4rK1ncOmLRCQbY0OpQJ4D7xH_4nTN4,4738
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.35.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.35.dist-info/METADATA,sha256=aziCzqOeetdE3gMV2i15QoB5O31bGpiZgzcpGM97QPk,16627
+rxnn-0.1.35.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.35.dist-info/RECORD,,
{rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/LICENSE
File without changes
{rxnn-0.1.33.dist-info → rxnn-0.1.35.dist-info}/WHEEL
File without changes