rxnn 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rxnn/experimental/attention.py CHANGED
@@ -100,7 +100,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
100
100
 
101
101
  # Get positions where expert e is used in any group
102
102
  x_slice = x_flat[token_mask]
103
- proj = F.linear(x_slice, w[e], b[e] if b is not None else None)
103
+ proj = F.linear(x_slice, w[e].t(), b[e] if b is not None else None)
104
104
 
105
105
  # Find which groups use this expert for selected tokens
106
106
  group_mask = (indices_flat[token_mask] == e)
@@ -118,6 +118,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
118
118
  # Key/Value processing
119
119
  B, S, D = key.shape
120
120
  key_flat = key.view(-1, D)
121
+ print('key_flat: ', key_flat.shape)
121
122
  weights_k_flat, indices_k_flat = self.router(key_flat)
122
123
  # Reshape back to original dimensions
123
124
  weights_k = weights_k_flat.view(B, S, -1)
rxnn/transformers/moe.py CHANGED
@@ -16,22 +16,26 @@ class MoeRouter(nn.Module):
16
16
 
17
17
  def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
18
18
  expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
19
+ print('expert mask: ', expert_mask.shape)
19
20
  expert_usage = expert_mask.sum(dim=0).mean(dim=0)
21
+ print('expert usage: ', expert_usage.shape)
20
22
  mean_probs = probs.mean(dim=0)
23
+ print('mean probs: ', mean_probs.shape)
21
24
  return (expert_usage * mean_probs).sum() * self.num_experts
22
25
 
23
26
 
24
27
  def forward(self, x: torch.Tensor):
25
28
  # Input shape: [batch*seq_len, embed_dim]
26
29
  logits = self.gate(x)
30
+ print('router logits: ', logits.shape)
27
31
  probs = F.softmax(logits, dim=-1)
28
-
32
+ print('router probs: ', probs.shape)
29
33
  # Get top-k experts for each token
30
34
  top_k_weights, top_k_indices = probs.topk(self.top_k, dim=-1)
31
35
 
32
36
  # Normalize weights (sum to 1 for each token)
33
37
  top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
34
-
38
+ print('top k: ', top_k_weights.shape, top_k_indices.shape)
35
39
  # Load Balance Loss
36
40
  self.aux_loss = self.calculate_aux_loss(top_k_indices, probs)
37
41
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rxnn
3
- Version: 0.1.17
3
+ Version: 0.1.19
4
4
  Summary: RxNN: Reactive Neural Networks Platform
5
5
  License: Apache-2.0
6
6
  Keywords: deep-learning,ai,machine-learning
@@ -1,6 +1,6 @@
1
1
  rxnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  rxnn/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- rxnn/experimental/attention.py,sha256=wjHrxfov3Ybg3iou8FlQtFvxNuHdcs_A7a6FTloosgA,32056
3
+ rxnn/experimental/attention.py,sha256=nvYtC6BdJQ8VUcNc_co2Fe2at7TBzA4OOIfG2tWWVCk,32104
4
4
  rxnn/experimental/models.py,sha256=-XkEHsyT8iNAjhZbgC7N_5nzP4ENVJLwxSoLHgMfA0I,4668
5
5
  rxnn/experimental/moe.py,sha256=PhiaNr3FwR2Zv2a0tfj6sfZ4iyhLo3Jyp2DwXq19qZQ,7935
6
6
  rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,11 +21,11 @@ rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
21
21
  rxnn/transformers/layers.py,sha256=HhIiykmrBgdsV4AbMQXr9t0cSo4gSIeN0dPtc8mDyOo,5629
22
22
  rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
23
23
  rxnn/transformers/models.py,sha256=w-zB_8QB9-Fae-GkGgmVDNY-Ts_0gBeWcevpl9qzZVM,7169
24
- rxnn/transformers/moe.py,sha256=FeaQR7hTX1dE74YdMOcuyZHSkGiV_0JwF8fw-GnfNOQ,4741
24
+ rxnn/transformers/moe.py,sha256=gJ-jXKtc01xcBayaYchRZy7imFGnvwVfUflXvFiKjKU,5048
25
25
  rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
26
26
  rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
27
27
  rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
28
- rxnn-0.1.17.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
29
- rxnn-0.1.17.dist-info/METADATA,sha256=wId6o7JCcBjRD1plWzgJRmFAY5VlHN7-FIVySeVDqx8,16627
30
- rxnn-0.1.17.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
31
- rxnn-0.1.17.dist-info/RECORD,,
28
+ rxnn-0.1.19.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
29
+ rxnn-0.1.19.dist-info/METADATA,sha256=4ul6X1SOT2bzHCxK88SjcYc0-1zy8YAKPCoMtZ2dKrY,16627
30
+ rxnn-0.1.19.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
31
+ rxnn-0.1.19.dist-info/RECORD,,
File without changes
File without changes