rxnn 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/experimental/attention.py +31 -25
- rxnn/transformers/moe.py +0 -6
- {rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/METADATA +1 -1
- {rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/RECORD +6 -6
- {rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/LICENSE +0 -0
- {rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/WHEEL +0 -0
rxnn/experimental/attention.py
CHANGED
@@ -65,9 +65,9 @@ class GroupedMoeAttention(GroupedQueryAttention):
         self.router = MoeRouter(embed_dim, self.num_experts, top_k=self.num_groups)
 
         hidden_dim = embed_dim // self.num_heads
-        self.wk = nn.Parameter(torch.empty(self.num_experts,
+        self.wk = nn.Parameter(torch.empty(self.num_experts, hidden_dim, embed_dim))
         self.bk = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
-        self.wv = nn.Parameter(torch.empty(self.num_experts,
+        self.wv = nn.Parameter(torch.empty(self.num_experts, hidden_dim, embed_dim))
         self.bv = nn.Parameter(torch.zeros(self.num_experts, hidden_dim)) if self.use_bias else None
         self._init_experts()
 
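Context for the shape fix above: `F.linear` expects its weight as `[out_features, in_features]`, so storing each expert as a `[hidden_dim, embed_dim]` slice lets a single `w[e]` project flattened tokens from `embed_dim` down to `hidden_dim`. A minimal sketch with placeholder sizes (none of these values come from the library's defaults):

```python
import torch
import torch.nn.functional as F

# Placeholder sizes for illustration only
num_experts, embed_dim, num_heads = 4, 512, 8
hidden_dim = embed_dim // num_heads

# Same layout as the fixed parameter: [num_experts, hidden_dim, embed_dim]
wk = torch.empty(num_experts, hidden_dim, embed_dim)
torch.nn.init.xavier_uniform_(wk)

tokens = torch.randn(10, embed_dim)   # 10 flattened tokens
out = F.linear(tokens, wk[2])         # expert 2's slice acts as an [out, in] weight
assert out.shape == (10, hidden_dim)
```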
@@ -80,34 +80,34 @@ class GroupedMoeAttention(GroupedQueryAttention):
 
     def _process_grouped_experts(self, x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, weights: torch.Tensor, indices: torch.Tensor):
         B, S, G = indices.shape
-        x_flat = x.view(-1, x.size(-1))
-
-        # Flatten batch and sequence dimensions
-        indices_flat = indices.view(-1, G)
-        weights_flat = weights.view(-1, G, 1)
+        x_flat = x.view(-1, x.size(-1))  # [B*S, D]
 
-
-
-        for g in range(G):
-            mask.scatter_(1, indices_flat[:, g].unsqueeze(1), True)
+        indices_flat = indices.view(-1, G)  # [B*S, G]
+        weights_flat = weights.view(-1, G)  # [B*S, G]
 
-        output = torch.zeros(B * S, G, w.size(
+        output = torch.zeros(B * S, G, w.size(1), device=x.device, dtype=x.dtype)  # [B*S, G, hidden_dim]
 
         for e in range(self.num_experts):
-
-
+            # 1. Find tokens where expert `e` is used in ANY group
+            expert_mask = (indices_flat == e).any(dim=1)  # [B*S]
+            if not expert_mask.any():
                 continue
 
-            #
-            x_slice = x_flat[
-            proj = F.linear(x_slice, w[e]
+            # 2. Project tokens using expert `e`
+            x_slice = x_flat[expert_mask]  # [num_selected, D]
+            proj = F.linear(x_slice, w[e], b[e] if b is not None else None)  # [num_selected, hidden_dim]
 
-            #
-
+            # 3. Scatter projections into correct groups
+            for g in range(G):
+                group_mask = indices_flat[expert_mask, g] == e  # [num_selected]
+                if not group_mask.any():
+                    continue
 
-
-
-
+                # Get tokens in this group using expert `e`
+                group_tokens = expert_mask.nonzero()[group_mask].squeeze(1)
+                # Weight and scatter
+                weighted_proj = proj[group_mask] * weights_flat[group_tokens, g].unsqueeze(-1)
+                output[group_tokens, g] += weighted_proj
 
         return output.view(B, S, G, -1)
 
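Pulled out of the class for readability, the rewritten dispatch reduces to the following self-contained sketch (names and shapes mirror the diff; the sizes in the smoke test are placeholders):

```python
import torch
import torch.nn.functional as F

def process_grouped_experts(x, w, b, weights, indices):
    # x: [B, S, D], w: [E, hidden_dim, D], b: [E, hidden_dim] or None,
    # weights/indices: [B, S, G] router outputs
    B, S, G = indices.shape
    x_flat = x.view(-1, x.size(-1))        # [B*S, D]
    indices_flat = indices.view(-1, G)     # [B*S, G]
    weights_flat = weights.view(-1, G)     # [B*S, G]
    output = torch.zeros(B * S, G, w.size(1), device=x.device, dtype=x.dtype)

    for e in range(w.size(0)):
        # Tokens where expert e is selected in any group
        expert_mask = (indices_flat == e).any(dim=1)
        if not expert_mask.any():
            continue
        # Project only the selected tokens
        proj = F.linear(x_flat[expert_mask], w[e], b[e] if b is not None else None)
        # Scatter the projections back into their groups, scaled by router weight
        for g in range(G):
            group_mask = indices_flat[expert_mask, g] == e
            if not group_mask.any():
                continue
            group_tokens = expert_mask.nonzero()[group_mask].squeeze(1)
            output[group_tokens, g] += proj[group_mask] * weights_flat[group_tokens, g].unsqueeze(-1)
    return output.view(B, S, G, -1)

# Smoke test with placeholder sizes
B, S, D, E, G, H = 2, 8, 32, 4, 2, 16
probs = torch.softmax(torch.randn(B, S, E), dim=-1)
weights, indices = probs.topk(G, dim=-1)
out = process_grouped_experts(torch.randn(B, S, D), torch.randn(E, H, D), None, weights, indices)
assert out.shape == (B, S, G, H)
```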
@@ -118,7 +118,6 @@ class GroupedMoeAttention(GroupedQueryAttention):
         # Key/Value processing
         B, S, D = key.shape
         key_flat = key.view(-1, D)
-        print('key_flat: ', key_flat.shape)
         weights_k_flat, indices_k_flat = self.router(key_flat)
         # Reshape back to original dimensions
         weights_k = weights_k_flat.view(B, S, -1)
@@ -126,6 +125,9 @@ class GroupedMoeAttention(GroupedQueryAttention):
         k = self._process_grouped_experts(key, self.wk, self.bk, weights_k, indices_k)
         v = self._process_grouped_experts(value, self.wv, self.bv, weights_k, indices_k)
 
+        print('processed k', k.size())
+        print('processed v', v.size())
+
         # Expand to GQA format
         k = k.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
         v = v.permute(0, 2, 1, 3).reshape(B, self.num_groups, S, -1)
@@ -139,6 +141,10 @@ class GroupedMoeAttention(GroupedQueryAttention):
         k = k.flatten(start_dim=1, end_dim=2)  # (B, H, S, head_dim)
         v = v.flatten(start_dim=1, end_dim=2)  # (B, H, S, head_dim)
 
+        print('q', q.size())
+        print('k', k.size())
+        print('v', v.size())
+
         return q, k, v
 
 
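The reshapes around the new debug prints are easier to follow with explicit shapes. The group-to-head expansion itself sits between these two hunks and is not part of the diff, so the `expand` below is only one plausible way to get from `[B, G, S, hidden_dim]` to the commented `(B, H, S, head_dim)`:

```python
import torch

# Placeholder sizes; head_dim == hidden_dim is an assumption
B, S, num_groups, num_heads, hidden_dim = 2, 16, 4, 8, 64
heads_per_group = num_heads // num_groups

k = torch.randn(B, S, num_groups, hidden_dim)             # _process_grouped_experts output
k = k.permute(0, 2, 1, 3).reshape(B, num_groups, S, -1)   # [B, G, S, hidden_dim]

# Hypothetical expansion to full heads (not shown in the diff)
k = k.unsqueeze(2).expand(B, num_groups, heads_per_group, S, hidden_dim)
k = k.flatten(start_dim=1, end_dim=2)                     # (B, H, S, head_dim)
assert k.shape == (B, num_heads, S, hidden_dim)
```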
@@ -199,7 +205,7 @@ class DeepMoeAttention(GroupedMoeAttention):
         self.query_router = MoeRouter(embed_dim, self.num_query_experts, top_k=self.num_query_groups)
 
         hidden_dim = embed_dim // self.num_heads
-        self.wq = nn.Parameter(torch.empty(self.num_query_experts,
+        self.wq = nn.Parameter(torch.empty(self.num_query_experts, hidden_dim, embed_dim))
         self.bq = nn.Parameter(torch.zeros(self.num_query_experts, hidden_dim)) if self.use_bias else None
         self._init_query_experts()
 
@@ -217,7 +223,7 @@ class DeepMoeAttention(GroupedMoeAttention):
         # Query processing
         B, T, D = query.shape
         # Flatten for query routing
-        query_flat = query.view(
+        query_flat = query.view(-1, D)
         weights_q_flat, indices_q_flat = self.query_router(query_flat)
         # Reshape back
         weights_q = weights_q_flat.view(B, T, -1)
rxnn/transformers/moe.py
CHANGED
@@ -16,26 +16,20 @@ class MoeRouter(nn.Module):
 
     def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
         expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
-        print('expert mask: ', expert_mask.shape)
         expert_usage = expert_mask.sum(dim=0).mean(dim=0)
-        print('expert usage: ', expert_usage.shape)
         mean_probs = probs.mean(dim=0)
-        print('mean probs: ', mean_probs.shape)
         return (expert_usage * mean_probs).sum() * self.num_experts
 
 
     def forward(self, x: torch.Tensor):
         # Input shape: [batch*seq_len, embed_dim]
         logits = self.gate(x)
-        print('router logits: ', logits.shape)
         probs = F.softmax(logits, dim=-1)
-        print('router probs: ', probs.shape)
         # Get top-k experts for each token
         top_k_weights, top_k_indices = probs.topk(self.top_k, dim=-1)
 
         # Normalize weights (sum to 1 for each token)
         top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
-        print('top k: ', top_k_weights.shape, top_k_indices.shape)
         # Load Balance Loss
         self.aux_loss = self.calculate_aux_loss(top_k_indices, probs)
 
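With the debug prints stripped, the router reduces to top-k gating plus a load-balancing auxiliary loss. A standalone sketch of that logic (class attributes replaced by plain arguments; `gate_weight` is a stand-in for `self.gate`):

```python
import torch
import torch.nn.functional as F

def route(x, gate_weight, top_k, num_experts):
    # x: [tokens, embed_dim], gate_weight: [num_experts, embed_dim]
    logits = F.linear(x, gate_weight)
    probs = F.softmax(logits, dim=-1)
    top_k_weights, top_k_indices = probs.topk(top_k, dim=-1)
    # Normalize so each token's selected weights sum to 1
    top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)

    # Auxiliary loss: per-expert usage counts (averaged over top-k slots)
    # times mean gate probability, scaled by num_experts
    expert_mask = F.one_hot(top_k_indices, num_experts).float()  # [tokens, top_k, E]
    expert_usage = expert_mask.sum(dim=0).mean(dim=0)            # [E]
    mean_probs = probs.mean(dim=0)                               # [E]
    aux_loss = (expert_usage * mean_probs).sum() * num_experts
    return top_k_weights, top_k_indices, aux_loss

weights, indices, aux = route(torch.randn(64, 512), torch.randn(8, 512), top_k=2, num_experts=8)
assert weights.shape == (64, 2) and indices.shape == (64, 2)
```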
{rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 rxnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/experimental/attention.py,sha256=
+rxnn/experimental/attention.py,sha256=6qD3QCpkQHsIaNktjcQrRitQgQ-WkRUVtSFgEDfYGbA,32340
 rxnn/experimental/models.py,sha256=-XkEHsyT8iNAjhZbgC7N_5nzP4ENVJLwxSoLHgMfA0I,4668
 rxnn/experimental/moe.py,sha256=PhiaNr3FwR2Zv2a0tfj6sfZ4iyhLo3Jyp2DwXq19qZQ,7935
 rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,11 +21,11 @@ rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=HhIiykmrBgdsV4AbMQXr9t0cSo4gSIeN0dPtc8mDyOo,5629
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
 rxnn/transformers/models.py,sha256=w-zB_8QB9-Fae-GkGgmVDNY-Ts_0gBeWcevpl9qzZVM,7169
-rxnn/transformers/moe.py,sha256=
+rxnn/transformers/moe.py,sha256=msspVdefdt2ekIN8aT-V8DolK4taESQL_NVsSGOepIs,4739
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.21.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.21.dist-info/METADATA,sha256=gtCrs3sVTMXB9UNS1-qcJNIPzHNO8d7UaJlfviJNFEI,16627
+rxnn-0.1.21.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.21.dist-info/RECORD,,
{rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/LICENSE
File without changes
{rxnn-0.1.19.dist-info → rxnn-0.1.21.dist-info}/WHEEL
File without changes