rxnn 0.1.51__py3-none-any.whl → 0.1.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/transformers/models.py +6 -2
- rxnn/transformers/moe.py +30 -4
- {rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/METADATA +1 -1
- {rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/RECORD +6 -6
- {rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/LICENSE +0 -0
- {rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/WHEEL +0 -0
rxnn/transformers/models.py
CHANGED
@@ -16,6 +16,7 @@ class ReactiveTransformerBase(nn.Module):
             shared_layers: nn.ModuleList = None,
             absolute_embedding: AbsolutePositionalEmbedding = None,
             use_flash_attention: bool = False,
+            use_relative_embedding: bool = False,
             *args,
             **kwargs,
     ):
@@ -25,6 +26,7 @@ class ReactiveTransformerBase(nn.Module):
         self.stm = stm
         self.pos_embedding = absolute_embedding
         self.use_flash_attention = use_flash_attention
+        self.use_relative_embedding = use_relative_embedding
 
         self.shared_layers = shared_layers
         self.layers = own_layers
@@ -59,7 +61,7 @@ class ReactiveTransformerDecoder(ReactiveTransformerBase):
     def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
         x = super().forward(x)  # apply embeddings
         seq_len = x.size(1)
-        if not self.use_flash_attention:
+        if not self.use_flash_attention and self.use_relative_embedding:
             mask = create_causal_mask(seq_len, device=x.device)
             if attention_mask is not None:
                 mask &= attention_mask.unsqueeze(1).unsqueeze(1).bool()
@@ -111,6 +113,7 @@ class ClassicTransformerBase(nn.Module):
             layers: nn.ModuleList,
             absolute_embedding: AbsolutePositionalEmbedding = None,
             use_flash_attention: bool = False,
+            use_relative_embedding: bool = False,
             *args,
             **kwargs,
     ):
@@ -119,6+122,7 @@ class ClassicTransformerBase(nn.Module):
         self.embedding = embedding
         self.pos_embedding = absolute_embedding
         self.use_flash_attention = use_flash_attention
+        self.use_relative_embedding = use_relative_embedding
 
         self.layers = layers
         self.num_layers = len(layers) if layers else 0
@@ -144,7 +148,7 @@ class ClassicTransformerDecoder(ClassicTransformerBase):
     def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
         x = super().forward(x)  # apply embeddings
         seq_len = x.size(1)
-        if not self.use_flash_attention:
+        if not self.use_flash_attention and self.use_relative_embedding:
            mask = create_causal_mask(seq_len, device=x.device)
            if attention_mask is not None:
                mask &= attention_mask.unsqueeze(1).unsqueeze(1).bool()
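In 0.1.52 the explicit causal mask is built only when flash attention is off and the new use_relative_embedding flag is set, presumably leaving causal masking to the attention backend in every other case. The following is a minimal sketch of that branching logic, not rxnn's implementation: build_causal_mask and decoder_mask are hypothetical helpers (build_causal_mask stands in for rxnn's create_causal_mask), and the broadcasting shapes are assumptions.

import torch

def build_causal_mask(seq_len: int, device=None) -> torch.Tensor:
    # Lower-triangular boolean mask shaped (1, 1, S, S) so it broadcasts over (B, H, S, S) scores
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device)).view(1, 1, seq_len, seq_len)

def decoder_mask(x: torch.Tensor, attention_mask: torch.Tensor = None,
                 use_flash_attention: bool = False, use_relative_embedding: bool = False):
    seq_len = x.size(1)
    # 0.1.51 built the mask whenever flash attention was off; 0.1.52 additionally
    # requires relative embeddings, otherwise no dense mask is materialized here.
    if not use_flash_attention and use_relative_embedding:
        mask = build_causal_mask(seq_len, device=x.device)
        if attention_mask is not None:
            # Combine with the padding mask: (B, S) -> (B, 1, 1, S)
            mask = mask & attention_mask.unsqueeze(1).unsqueeze(1).bool()
        return mask
    return None

With use_relative_embedding defaulting to False, both decoder classes now skip building the O(seq_len²) mask unless it is explicitly requested.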
rxnn/transformers/moe.py
CHANGED
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+
 from .ff import FeedForward, GatedFeedForward
 
 class MoeRouter(nn.Module):
@@ -14,11 +15,36 @@ class MoeRouter(nn.Module):
         # For expert load balancing
         self.register_buffer('aux_loss', torch.tensor(0.0), persistent=False)
 
+    # def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
+    #     expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
+    #     expert_usage = expert_mask.sum(dim=0).mean(dim=0)
+    #     mean_probs = probs.mean(dim=0)
+    #     return (expert_usage * mean_probs).sum() * self.num_experts
+
     def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
-        expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
-        expert_usage = expert_mask.sum(dim=0).mean(dim=0)
-        mean_probs = probs.mean(dim=0)
-        return (expert_usage * mean_probs).sum() * self.num_experts
+        # Get shapes
+        B, S, K = top_k_indices.shape  # Batch, Sequence length, Top-K
+
+        # 1. Compute expert selection mask (one-hot encoded)
+        expert_mask = F.one_hot(top_k_indices, self.num_experts).float()  # (B, S, K, E)
+
+        # 2. Total number of times each expert is selected
+        expert_usage = expert_mask.sum(dim=(0, 1, 2))  # (E,)
+
+        # 3. Fraction of tokens assigned to each expert
+        total_tokens = B * S * K
+        fraction_expert = expert_usage / total_tokens  # (E,)
+
+        # 4. Sum of probabilities for each expert's selected tokens
+        sum_probs = (probs.unsqueeze(-1) * expert_mask).sum(dim=(0, 1, 2))  # (E,)
+
+        # 5. Average probability per expert (avoid division by zero)
+        avg_probs = sum_probs / expert_usage.clamp(min=1e-6)  # (E,)
+
+        # 6. Compute load balancing loss
+        loss = (fraction_expert * avg_probs).sum() * self.num_experts
+
+        return loss
 
     def forward(self, x: torch.Tensor):
         # Input shape: [batch*seq_len, embed_dim]
{rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/RECORD
CHANGED
@@ -20,12 +20,12 @@ rxnn/transformers/attention.py,sha256=dC0UmC-_kjX8US6Sf0Fi5zw5kJ-P6orH3JDHeBB5gI
 rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=OX8CsFY9A7uqH1SLwyexR_5BNlwheYrJHCGXjF8Q7HU,7186
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
-rxnn/transformers/models.py,sha256=
-rxnn/transformers/moe.py,sha256=
+rxnn/transformers/models.py,sha256=QFzBrOR7tDp9d_T0HoIukBMfEbLxsCictV5p3e2ilxg,7552
+rxnn/transformers/moe.py,sha256=88-w4cQhYNcebdq4zBsdkaoFa4VxJi1LFXDKAAkfVLk,5791
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.52.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.52.dist-info/METADATA,sha256=aae9Bt0SpsDgugeHY-7Bi6SN3wWhXneD3Kbz1NMtxJo,16627
+rxnn-0.1.52.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.52.dist-info/RECORD,,
{rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/LICENSE
File without changes
{rxnn-0.1.51.dist-info → rxnn-0.1.52.dist-info}/WHEEL
File without changes