rxnn 0.1.60__py3-none-any.whl → 0.1.62__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
- rxnn/transformers/attention.py +0 -3
- rxnn/transformers/layers.py +0 -3
- rxnn/transformers/models.py +4 -4
- {rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/METADATA +1 -2
- {rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/RECORD +7 -7
- {rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/LICENSE +0 -0
- {rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/WHEEL +0 -0
rxnn/transformers/attention.py
CHANGED
@@ -137,9 +137,6 @@ class MultiHeadAttention(nn.Module):
         b, t, d = query.size()
         q, k, v = self._forward_qkv(query, key, value, b, t, d)
         if not self.rel_embed:
-            print('q', q.size())
-            print('k', k.size())
-            print('v', v.size())
             q, k = self._apply_rope(q, k)
             attn_output = self._calculate_attention(q, k, v, b, t, d, mask=mask)
         else:
rxnn/transformers/layers.py
CHANGED
@@ -86,7 +86,6 @@ class ReactiveTransformerLayer(nn.Module):
         residual = x
         if not self.use_post_norm:
             x = self.norm1(x)
-        print('self x', x.size())
         x = self.attention(x, x, x, mask=mask)
         x = residual + x
         if self.use_post_norm:
@@ -95,8 +94,6 @@ class ReactiveTransformerLayer(nn.Module):
         residual = x
         if not self.use_post_norm:
             x = self.norm2(x)
-        print('x', x.size())
-        print('STM', stm.size())
         x = self.memory_cross_attention(x, stm, stm)
         x = residual + x
         if self.use_post_norm:
rxnn/transformers/models.py
CHANGED
@@ -72,11 +72,11 @@ class ReactiveTransformerDecoder(ReactiveTransformerBase):
         # Process shared layers
         if self.shared_layers is not None:
             for i in range(self.num_shared_layers):
-                layer_stm = self.stm(i)
+                layer_stm = self.stm(i).expand(x.size(0), -1, -1)
                 x = self.shared_layers[i](x, layer_stm, mask=mask)
         # Process own layers
         for i in range(self.num_own_layers):
-            layer_stm = self.stm(i)
+            layer_stm = self.stm(i).expand(x.size(0), -1, -1)
             x = self.layers[i](x, layer_stm, mask=mask)
         return self.head(x)

@@ -93,12 +93,12 @@ class ReactiveTransformerEncoder(ReactiveTransformerBase):
         # Process shared layers
         if self.shared_layers is not None:
             for i in range(self.num_shared_layers):
-                layer_stm = self.stm(i)
+                layer_stm = self.stm(i).expand(x.size(0), -1, -1)
                 x = self.shared_layers[i](x, layer_stm, mask=attention_mask)
                 hidden_states.append(x)
         # Process own layers
         for i in range(self.num_own_layers):
-            layer_stm = self.stm(i)
+            layer_stm = self.stm(i).expand(x.size(0), -1, -1)
             x = self.layers[i](x, layer_stm, mask=attention_mask)
             hidden_states.append(x)
         return x, torch.stack(hidden_states)
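The models.py change above expands each layer's short-term memory (STM) slot along the batch dimension before it is passed to memory cross-attention, so the memory keys and values match the batch size of the input x. A minimal sketch of what .expand(x.size(0), -1, -1) does, assuming an STM slot of shape (1, stm_size, embed_dim); the shapes and names below are illustrative assumptions, not taken from the package:

import torch

# Illustrative shapes only; not taken from the rxnn source.
batch_size, seq_len, stm_size, embed_dim = 4, 16, 8, 32

x = torch.randn(batch_size, seq_len, embed_dim)   # layer input
layer_stm = torch.randn(1, stm_size, embed_dim)   # one layer's STM slot

# expand() broadcasts the singleton batch dimension as a view, without
# copying data, so every sample in the batch attends to the same memory state.
layer_stm_batched = layer_stm.expand(x.size(0), -1, -1)
print(layer_stm_batched.shape)  # torch.Size([4, 8, 32])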
{rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: rxnn
-Version: 0.1.60
+Version: 0.1.62
 Summary: RxNN: Reactive Neural Networks Platform
 License: Apache-2.0
 Keywords: deep-learning,ai,machine-learning
@@ -14,7 +14,6 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: datasets (>=3.5.0,<4.0.0)
-Requires-Dist: flash-attention (>=1.0.0,<2.0.0)
 Requires-Dist: huggingface-hub (>=0.30.0,<0.31.0)
 Requires-Dist: tensorboard (>=2.19.0,<3.0.0)
 Requires-Dist: tokenizers (>=0.21.0,<0.22.0)
{rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/RECORD
CHANGED
@@ -16,16 +16,16 @@ rxnn/training/dataset.py,sha256=JQuWSUdT5AnsrG6M_EsewoU6uroVHhg4K715nbtDx8A,9643
 rxnn/training/scheduler.py,sha256=ow6oALzWjWQmHSpcJEjv6tg4g4CDMvr73TypxfcefMc,712
 rxnn/training/tokenizer.py,sha256=umaLByMBx_NMrQElA45HLm9gkuzyKWDTFaKVd-CjXl0,8344
 rxnn/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/transformers/attention.py,sha256=
+rxnn/transformers/attention.py,sha256=dC0UmC-_kjX8US6Sf0Fi5zw5kJ-P6orH3JDHeBB5gI8,15695
 rxnn/transformers/ff.py,sha256=jJnuBDsnnX5uYC_WZH8cXAYrMnz0P-iX7MwcPivjRtI,2533
-rxnn/transformers/layers.py,sha256=
+rxnn/transformers/layers.py,sha256=OX8CsFY9A7uqH1SLwyexR_5BNlwheYrJHCGXjF8Q7HU,7186
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
-rxnn/transformers/models.py,sha256=
+rxnn/transformers/models.py,sha256=xbnn3FTNZFhaqq9A0XEM12ie_WL_58pPeq0qFXIgve0,7656
 rxnn/transformers/moe.py,sha256=j6jEx6Ip0zttlUZKKn82azxo95lkLZs-H2GLSMD88hY,5859
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=poWBpxg1iuK5gEJtxHkk5VVfS9V48hs2Olqdhy_Gw8c,6548
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.62.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.62.dist-info/METADATA,sha256=ku_ZxKtWSEaqJ5qeRmlAeKSk69upNr_CWRgoZel3yOo,16579
+rxnn-0.1.62.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.62.dist-info/RECORD,,
{rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/LICENSE
File without changes
{rxnn-0.1.60.dist-info → rxnn-0.1.62.dist-info}/WHEEL
File without changes