rxnn 0.1.14__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {rxnn-0.1.14 → rxnn-0.1.15}/PKG-INFO +1 -1
  2. {rxnn-0.1.14 → rxnn-0.1.15}/pyproject.toml +1 -1
  3. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/moe.py +33 -23
  4. {rxnn-0.1.14 → rxnn-0.1.15}/LICENSE +0 -0
  5. {rxnn-0.1.14 → rxnn-0.1.15}/README.md +0 -0
  6. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/__init__.py +0 -0
  7. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/experimental/__init__.py +0 -0
  8. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/experimental/attention.py +0 -0
  9. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/memory/__init__.py +0 -0
  10. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/memory/norm.py +0 -0
  11. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/memory/stm.py +0 -0
  12. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/rxt/__init__.py +0 -0
  13. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/rxt/models.py +0 -0
  14. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/__init__.py +0 -0
  15. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/base.py +0 -0
  16. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/bml.py +0 -0
  17. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/callbacks.py +0 -0
  18. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/dataset.py +0 -0
  19. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/scheduler.py +0 -0
  20. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/training/tokenizer.py +0 -0
  21. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/__init__.py +0 -0
  22. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/attention.py +0 -0
  23. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/ff.py +0 -0
  24. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/layers.py +0 -0
  25. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/mask.py +0 -0
  26. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/models.py +0 -0
  27. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/positional.py +0 -0
  28. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/sampler.py +0 -0
  29. {rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/utils.py +0 -0
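
Of the 29 files, only src/rxnn/transformers/moe.py carries code changes (+33 -23); the PKG-INFO and pyproject.toml hunks below are the version bump, and the remaining 26 files are unchanged. A runnable sketch of the new selected-expert computation follows the moe.py hunk.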
{rxnn-0.1.14 → rxnn-0.1.15}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: rxnn
-Version: 0.1.14
+Version: 0.1.15
 Summary: RxNN: Reactive Neural Networks Platform
 License: Apache-2.0
 Keywords: deep-learning,ai,machine-learning
{rxnn-0.1.14 → rxnn-0.1.15}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "rxnn"
-version = "0.1.14"
+version = "0.1.15"
 description = "RxNN: Reactive Neural Networks Platform"
 
 license = "Apache-2.0"
{rxnn-0.1.14 → rxnn-0.1.15}/src/rxnn/transformers/moe.py
@@ -106,34 +106,44 @@ class MoeFeedForward(nn.Module):
         x = x.view(-1, self.embed_dim) # [batch*seq_len, embed_dim]
 
         # Get routing weights and indices
-        weights, indices = self.router(x) # [batch*seq_len, top_k], [batch*seq_len, top_k]
+        weights, indices = self.router(x) # [B*T, top_k], [B*T, top_k]
 
         # Flatten indices and weights
-        batch_size = x.size(0)
-        top_k = indices.size(1)
-        indices = indices.view(-1) # [batch*seq_len * top_k]
-        weights = weights.view(-1, 1) # [batch*seq_len * top_k, 1]
-
-        # Select only the relevant experts for each token
-        selected_w1 = self.w1[indices] # [batch*seq_len * top_k, embed_dim, hidden_dim]
-        selected_b1 = self.b1[indices] # [batch*seq_len * top_k, hidden_dim]
-        selected_w2 = self.w2[indices] # [batch*seq_len * top_k, hidden_dim, embed_dim]
-        selected_b2 = self.b2[indices] # [batch*seq_len * top_k, embed_dim]
-
-        # Reshape x for batched computation
-        x_expanded = x.unsqueeze(1).repeat(1, top_k, 1).view(-1, self.embed_dim) # [batch*seq_len * top_k, embed_dim]
-
-        # Compute only the selected experts
-        h = torch.einsum('be, beh -> bh', x_expanded, selected_w1) + selected_b1
+        batch_size = x.shape[0]
+        top_k = indices.shape[1]
+        indices_flat = indices.view(-1) # [B*T * top_k]
+
+        # Compute contributions for selected experts without materializing large tensors
+        # First Layer:
+        # Compute all expert contributions first (but this may still be memory-heavy)
+        # Alternative: Compute contributions for selected experts directly
+        # ... (see detailed steps below)
+
+        # Alternative approach using gather and batched operations
+        x_expanded = x.unsqueeze(1).repeat(1, top_k, 1).view(-1, self.embed_dim) # [B*T*top_k, D]
+
+        # Compute first layer contributions using gather
+        # indices_flat has shape [B*T*top_k]
+        # selected_w1 is self.w1[indices_flat], but we compute the product inline
+        h = torch.einsum(
+            'be, eih -> bh',
+            x_expanded,
+            self.w1[indices_flat]
+        ) + self.b1[indices_flat]
         h = self._activate(h)
         h = self.dropout(h)
 
-        out = torch.einsum('bh, bhe -> be', h, selected_w2) + selected_b2
-
-        # Reshape back and apply weights
-        out = out.view(batch_size, top_k, -1) # [batch*seq_len, top_k, embed_dim]
-        weights = weights.view(batch_size, top_k, 1) # [batch*seq_len, top_k, 1]
-        out = (out * weights).sum(dim=1) # Weighted sum over top_k experts
+        # Second layer:
+        out = torch.einsum(
+            'bh, eho -> beo',
+            h,
+            self.w2[indices_flat]
+        ).squeeze(-1) + self.b2[indices_flat]
+
+        # Reshape and apply weights
+        out = out.view(batch_size, top_k, -1)
+        weights = weights.view(batch_size, top_k, 1)
+        out = (out * weights).sum(dim=1)
 
         return out.view(*orig_shape)
 
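The moe.py change drops the pre-gathered selected_w1/selected_b1/selected_w2/selected_b2 tensors and indexes the expert parameters inline inside the einsum calls, per the new comments about avoiding materializing large intermediates. Below is a minimal, self-contained sketch of that routing path, not the package's code: the sizes, the relu stand-in for self._activate and dropout, and the softmax-plus-topk stand-in for self.router are all assumptions for illustration. Since w1[indices_flat] gathers one [embed_dim, hidden_dim] matrix per token slot, the sketch labels the gathered operands 'beh' and 'bhe' in the einsums.

import torch
import torch.nn.functional as F

# Toy sizes; illustrative assumptions, not the package's defaults.
num_experts, embed_dim, hidden_dim, top_k = 8, 16, 32, 2
tokens = 4 * 5  # batch * seq_len

# Per-expert parameters, shaped like MoeFeedForward's w1/b1/w2/b2.
w1 = torch.randn(num_experts, embed_dim, hidden_dim)
b1 = torch.randn(num_experts, hidden_dim)
w2 = torch.randn(num_experts, hidden_dim, embed_dim)
b2 = torch.randn(num_experts, embed_dim)

x = torch.randn(tokens, embed_dim)  # [B*T, embed_dim]

# Stand-in for self.router(x): softmax gate over a random projection, then top-k.
gate = torch.randn(embed_dim, num_experts)
weights, indices = torch.topk(F.softmax(x @ gate, dim=-1), top_k, dim=-1)

indices_flat = indices.view(-1)                                      # [B*T * top_k]
x_expanded = x.unsqueeze(1).repeat(1, top_k, 1).view(-1, embed_dim)  # [B*T*top_k, D]

# First projection: gather each token slot's expert and contract over embed_dim.
h = torch.einsum('be,beh->bh', x_expanded, w1[indices_flat]) + b1[indices_flat]
h = torch.relu(h)  # stand-in for self._activate + dropout

# Second projection back to embed_dim, again per selected expert.
out = torch.einsum('bh,bhe->be', h, w2[indices_flat]) + b2[indices_flat]

# Weighted sum of the top_k expert outputs per token.
out = (out.view(tokens, top_k, embed_dim) * weights.unsqueeze(-1)).sum(dim=1)

# Slow reference loop over the same routing decisions, to sanity-check the
# vectorized path.
ref = torch.zeros_like(out)
for t in range(tokens):
    for k in range(top_k):
        e = indices[t, k]
        ref[t] += weights[t, k] * (torch.relu(x[t] @ w1[e] + b1[e]) @ w2[e] + b2[e])
print(out.shape, torch.allclose(out, ref, atol=1e-4))  # torch.Size([20, 16]) True

The shared b subscript makes each einsum a batched matrix-vector product, so every token slot is multiplied only by the one expert it routed to, and the router weights then mix the top_k results; the loop at the end is just a reference implementation of the same routing decisions.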