rxnn 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/experimental/attention.py +2 -1
- rxnn/transformers/moe.py +6 -2
- {rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/METADATA +1 -1
- {rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/RECORD +6 -6
- {rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/LICENSE +0 -0
- {rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/WHEEL +0 -0
rxnn/experimental/attention.py
CHANGED
@@ -100,7 +100,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
 
             # Get positions where expert e is used in any group
             x_slice = x_flat[token_mask]
-            proj = F.linear(x_slice, w[e], b[e] if b is not None else None)
+            proj = F.linear(x_slice, w[e].t(), b[e] if b is not None else None)
 
             # Find which groups use this expert for selected tokens
             group_mask = (indices_flat[token_mask] == e)
@@ -118,6 +118,7 @@ class GroupedMoeAttention(GroupedQueryAttention):
         # Key/Value processing
         B, S, D = key.shape
         key_flat = key.view(-1, D)
+        print('key_flat: ', key_flat.shape)
         weights_k_flat, indices_k_flat = self.router(key_flat)
         # Reshape back to original dimensions
         weights_k = weights_k_flat.view(B, S, -1)
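Note on the change above: `F.linear(input, weight)` computes `input @ weight.T` and expects `weight` to have shape `[out_features, in_features]`. If the per-expert matrices in `w` are stored as `[in_features, out_features]`, they must be transposed before the call, which is what `w[e].t()` does. A minimal sketch of that convention follows; tensor names and sizes are illustrative, not taken from the rxnn source.

```python
# Sketch of the F.linear weight-orientation convention (illustrative shapes).
import torch
import torch.nn.functional as F

in_features, out_features, num_experts = 64, 32, 4
w = torch.randn(num_experts, in_features, out_features)  # assumed storage layout
x_slice = torch.randn(10, in_features)                   # tokens routed to expert e
e = 0

# F.linear expects weight as [out_features, in_features], so the stored
# [in_features, out_features] matrix is transposed before the call.
proj = F.linear(x_slice, w[e].t())  # -> [10, out_features]
print(proj.shape)                   # torch.Size([10, 32])
```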
rxnn/transformers/moe.py
CHANGED
@@ -16,22 +16,26 @@ class MoeRouter(nn.Module):
 
     def calculate_aux_loss(self, top_k_indices: torch.Tensor, probs: torch.Tensor) -> torch.Tensor:
         expert_mask = F.one_hot(top_k_indices, self.num_experts).float()
+        print('expert mask: ', expert_mask.shape)
         expert_usage = expert_mask.sum(dim=0).mean(dim=0)
+        print('expert usage: ', expert_usage.shape)
         mean_probs = probs.mean(dim=0)
+        print('mean probs: ', mean_probs.shape)
         return (expert_usage * mean_probs).sum() * self.num_experts
 
 
     def forward(self, x: torch.Tensor):
         # Input shape: [batch*seq_len, embed_dim]
         logits = self.gate(x)
+        print('router logits: ', logits.shape)
         probs = F.softmax(logits, dim=-1)
-
+        print('router probs: ', probs.shape)
         # Get top-k experts for each token
         top_k_weights, top_k_indices = probs.topk(self.top_k, dim=-1)
 
         # Normalize weights (sum to 1 for each token)
         top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
-
+        print('top k: ', top_k_weights.shape, top_k_indices.shape)
         # Load Balance Loss
         self.aux_loss = self.calculate_aux_loss(top_k_indices, probs)
 
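The `print` calls added above trace tensor shapes through `MoeRouter`. For reference, here is a self-contained sketch of the same load-balance auxiliary loss computation with illustrative sizes; `num_tokens`, `num_experts`, and `top_k` are assumptions, not values from the package.

```python
# Standalone sketch of the load-balancing aux loss shown in the MoeRouter diff.
import torch
import torch.nn.functional as F

num_tokens, num_experts, top_k = 8, 4, 2
probs = torch.softmax(torch.randn(num_tokens, num_experts), dim=-1)  # router probs
top_k_weights, top_k_indices = probs.topk(top_k, dim=-1)             # [tokens, top_k]

expert_mask = F.one_hot(top_k_indices, num_experts).float()  # [tokens, top_k, experts]
expert_usage = expert_mask.sum(dim=0).mean(dim=0)            # [experts]: mean selection count
mean_probs = probs.mean(dim=0)                               # [experts]: mean routing prob
aux_loss = (expert_usage * mean_probs).sum() * num_experts   # scalar load-balance penalty
print(expert_mask.shape, expert_usage.shape, mean_probs.shape, aux_loss.item())
```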
{rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 rxnn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/experimental/attention.py,sha256=
+rxnn/experimental/attention.py,sha256=nvYtC6BdJQ8VUcNc_co2Fe2at7TBzA4OOIfG2tWWVCk,32104
 rxnn/experimental/models.py,sha256=-XkEHsyT8iNAjhZbgC7N_5nzP4ENVJLwxSoLHgMfA0I,4668
 rxnn/experimental/moe.py,sha256=PhiaNr3FwR2Zv2a0tfj6sfZ4iyhLo3Jyp2DwXq19qZQ,7935
 rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,11 +21,11 @@ rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
 rxnn/transformers/layers.py,sha256=HhIiykmrBgdsV4AbMQXr9t0cSo4gSIeN0dPtc8mDyOo,5629
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
 rxnn/transformers/models.py,sha256=w-zB_8QB9-Fae-GkGgmVDNY-Ts_0gBeWcevpl9qzZVM,7169
-rxnn/transformers/moe.py,sha256=
+rxnn/transformers/moe.py,sha256=gJ-jXKtc01xcBayaYchRZy7imFGnvwVfUflXvFiKjKU,5048
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.19.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.19.dist-info/METADATA,sha256=4ul6X1SOT2bzHCxK88SjcYc0-1zy8YAKPCoMtZ2dKrY,16627
+rxnn-0.1.19.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.19.dist-info/RECORD,,
{rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/LICENSE
File without changes
{rxnn-0.1.17.dist-info → rxnn-0.1.19.dist-info}/WHEEL
File without changes