rxnn 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rxnn/transformers/moe.py
CHANGED
@@ -106,34 +106,44 @@ class MoeFeedForward(nn.Module):
         x = x.view(-1, self.embed_dim) # [batch*seq_len, embed_dim]

         # Get routing weights and indices
-        weights, indices = self.router(x) # [
+        weights, indices = self.router(x) # [B*T, top_k], [B*T, top_k]

         # Flatten indices and weights
-        batch_size = x.
-        top_k = indices.
-
-
-
-        #
-
-
-
-
-
-
-
-        #
-
+        batch_size = x.shape[0]
+        top_k = indices.shape[1]
+        indices_flat = indices.view(-1) # [B*T * top_k]
+
+        # Compute contributions for selected experts without materializing large tensors
+        # First Layer:
+        # Compute all expert contributions first (but this may still be memory-heavy)
+        # Alternative: Compute contributions for selected experts directly
+        # ... (see detailed steps below)
+
+        # Alternative approach using gather and batched operations
+        x_expanded = x.unsqueeze(1).repeat(1, top_k, 1).view(-1, self.embed_dim) # [B*T*top_k, D]
+
+        # Compute first layer contributions using gather
+        # indices_flat has shape [B*T*top_k]
+        # selected_w1 is self.w1[indices_flat], but we compute the product inline
+        h = torch.einsum(
+            'be, eih -> bh',
+            x_expanded,
+            self.w1[indices_flat]
+        ) + self.b1[indices_flat]
         h = self._activate(h)
         h = self.dropout(h)

-
-
-
-
-
-
+        # Second layer:
+        out = torch.einsum(
+            'bh, eho -> beo',
+            h,
+            self.w2[indices_flat]
+        ).squeeze(-1) + self.b2[indices_flat]
+
+        # Reshape and apply weights
+        out = out.view(batch_size, top_k, -1)
+        weights = weights.view(batch_size, top_k, 1)
+        out = (out * weights).sum(dim=1)

         return out.view(*orig_shape)

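The new forward pass avoids running every expert: it gathers only the weight slices of the experts the router selected for each token and contracts them with einsum. Below is a minimal, self-contained sketch of that gather-and-einsum pattern, not the package's code: the stacked parameter shapes, the ReLU stand-in for the module's activation, and the einsum subscripts are all assumptions for illustration (the subscripts shown in the extracted diff do not type-check against the commented shapes, so the sketch uses per-row labels that do).

import torch

# Assumed shapes, for illustration only (not taken from the package):
#   x:  [N, D]                flattened tokens, N = batch * seq_len
#   w1: [E, D, H], b1: [E, H] stacked per-expert first-layer parameters
#   w2: [E, H, D], b2: [E, D] stacked per-expert second-layer parameters
N, D, H, E, top_k = 8, 16, 32, 4, 2
x = torch.randn(N, D)
w1, b1 = torch.randn(E, D, H), torch.randn(E, H)
w2, b2 = torch.randn(E, H, D), torch.randn(E, D)
weights = torch.softmax(torch.randn(N, top_k), dim=-1)  # router weights
indices = torch.randint(0, E, (N, top_k))               # routed expert ids

indices_flat = indices.view(-1)                                   # [N*top_k]
x_expanded = x.unsqueeze(1).expand(-1, top_k, -1).reshape(-1, D)  # [N*top_k, D]

# First layer: contract each token row with its own gathered expert matrix.
# 'nd,ndh->nh' pairs row n of x_expanded with slice n of w1[indices_flat].
h = torch.relu(torch.einsum('nd,ndh->nh', x_expanded, w1[indices_flat])
               + b1[indices_flat])

# Second layer: the same per-row contraction, back to the embedding dimension.
out = torch.einsum('nh,nhd->nd', h, w2[indices_flat]) + b2[indices_flat]

# Weight the top_k expert outputs by the router probabilities and sum.
out = out.view(N, top_k, D)
out = (out * weights.unsqueeze(-1)).sum(dim=1)  # [N, D]

Compared with computing all E expert outputs and then selecting, this materializes only [N*top_k, ...] intermediates, which is what the diff's "without materializing large tensors" comment is after; note that the gathered weight tensor w1[indices_flat] is still [N*top_k, D, H], so the saving shrinks as top_k approaches E.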
rxnn-0.1.14.dist-info/RECORD → rxnn-0.1.15.dist-info/RECORD
RENAMED
@@ -19,11 +19,11 @@ rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=HhIiykmrBgdsV4AbMQXr9t0cSo4gSIeN0dPtc8mDyOo,5629
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
 rxnn/transformers/models.py,sha256=w-zB_8QB9-Fae-GkGgmVDNY-Ts_0gBeWcevpl9qzZVM,7169
-rxnn/transformers/moe.py,sha256=
+rxnn/transformers/moe.py,sha256=s2yeBsAg-JIqKp7tLlXPdLNar9FXZ14LgbHyXlUKk6o,6758
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.15.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.15.dist-info/METADATA,sha256=r3sjBGoGAsIcNqrNEC1tDuG6blEuNRVrQ_3fyy-yWJY,14629
+rxnn-0.1.15.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.15.dist-info/RECORD,,