rxnn 0.1.17__tar.gz → 0.1.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {rxnn-0.1.17 → rxnn-0.1.19}/PKG-INFO +1 -1
  2. {rxnn-0.1.17 → rxnn-0.1.19}/pyproject.toml +1 -1
  3. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/experimental/attention.py +2 -1
  4. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/moe.py +6 -2
  5. {rxnn-0.1.17 → rxnn-0.1.19}/LICENSE +0 -0
  6. {rxnn-0.1.17 → rxnn-0.1.19}/README.md +0 -0
  7. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/__init__.py +0 -0
  8. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/experimental/__init__.py +0 -0
  9. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/experimental/models.py +0 -0
  10. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/experimental/moe.py +0 -0
  11. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/memory/__init__.py +0 -0
  12. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/memory/norm.py +0 -0
  13. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/memory/stm.py +0 -0
  14. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/rxt/__init__.py +0 -0
  15. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/rxt/models.py +0 -0
  16. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/__init__.py +0 -0
  17. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/base.py +0 -0
  18. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/bml.py +0 -0
  19. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/callbacks.py +0 -0
  20. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/dataset.py +0 -0
  21. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/scheduler.py +0 -0
  22. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/training/tokenizer.py +0 -0
  23. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/__init__.py +0 -0
  24. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/attention.py +0 -0
  25. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/ff.py +0 -0
  26. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/layers.py +0 -0
  27. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/mask.py +0 -0
  28. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/models.py +0 -0
  29. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/positional.py +0 -0
  30. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/transformers/sampler.py +0 -0
  31. {rxnn-0.1.17 → rxnn-0.1.19}/src/rxnn/utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: rxnn
-Version: 0.1.17
+Version: 0.1.19
 Summary: RxNN: Reactive Neural Networks Platform
 License: Apache-2.0
 Keywords: deep-learning,ai,machine-learning
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "rxnn"
-version = "0.1.17"
+version = "0.1.19"
 description = "RxNN: Reactive Neural Networks Platform"

 license = "Apache-2.0"
@@ -100,7 +100,7 @@ class GroupedMoeAttention(GroupedQueryAttention):

 # Get positions where expert e is used in any group
 x_slice = x_flat[token_mask]
-proj = F.linear(x_slice, w[e], b[e] if b is not None else None)
+proj = F.linear(x_slice, w[e].t(), b[e] if b is not None else None)

 # Find which groups use this expert for selected tokens
 group_mask = (indices_flat[token_mask] == e)
@@ -118,6 +118,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
 # Key/Value processing
 B, S, D = key.shape
 key_flat = key.view(-1, D)
+print('key_flat: ', key_flat.shape)
 weights_k_flat, indices_k_flat = self.router(key_flat)
 # Reshape back to original dimensions
 weights_k = weights_k_flat.view(B, S, -1)
@@ -16,22 +16,26 @@ class MoeRouter(nn.Module):

 def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
 expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
+print('expert mask: ', expert_mask.shape)
 expert_usage = expert_mask.sum(dim=0).mean(dim=0)
+print('expert usage: ', expert_usage.shape)
 mean_probs = probs.mean(dim=0)
+print('mean probs: ', mean_probs.shape)
 return (expert_usage * mean_probs).sum() * self.num_experts


 def forward(self, x: torch.Tensor):
 # Input shape: [batch*seq_len, embed_dim]
 logits = self.gate(x)
+print('router logits: ', logits.shape)
 probs = F.softmax(logits, dim=-1)
-
+print('router probs: ', probs.shape)
 # Get top-k experts for each token
 top_k_weights, top_k_indices = probs.topk(self.top_k, dim=-1)

 # Normalize weights (sum to 1 for each token)
 top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
-
+print('top k: ', top_k_weights.shape, top_k_indices.shape)
 # Load Balance Loss
 self.aux_loss = self.calculate_aux_loss(top_k_indices, probs)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes