x-transformers 1.43.0.tar.gz → 1.43.1.tar.gz
- {x_transformers-1.43.0/x_transformers.egg-info → x_transformers-1.43.1}/PKG-INFO +1 -1
- {x_transformers-1.43.0 → x_transformers-1.43.1}/README.md +1 -1
- {x_transformers-1.43.0 → x_transformers-1.43.1}/setup.py +1 -1
- {x_transformers-1.43.0 → x_transformers-1.43.1}/tests/test_x_transformers.py +7 -2
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/x_transformers.py +8 -4
- {x_transformers-1.43.0 → x_transformers-1.43.1/x_transformers.egg-info}/PKG-INFO +1 -1
- {x_transformers-1.43.0 → x_transformers-1.43.1}/LICENSE +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/setup.cfg +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/__init__.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/attend.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/autoregressive_wrapper.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/continuous.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/dpo.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/multi_input.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/neo_mlp.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/nonautoregressive_wrapper.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/xl_autoregressive_wrapper.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/xval.py +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers.egg-info/SOURCES.txt +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers.egg-info/dependency_links.txt +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers.egg-info/requires.txt +0 -0
- {x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers.egg-info/top_level.txt +0 -0
{x_transformers-1.43.0 → x_transformers-1.43.1}/README.md

@@ -2240,7 +2240,7 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
-```
+```bibtex
 @article{Yang2017BreakingTS,
     title = {Breaking the Softmax Bottleneck: A High-Rank RNN Language Model},
     author = {Zhilin Yang and Zihang Dai and Ruslan Salakhutdinov and William W. Cohen},
{x_transformers-1.43.0 → x_transformers-1.43.1}/tests/test_x_transformers.py

@@ -591,7 +591,9 @@ def test_cross_attn_rotary(
         context_mask = context_mask
     )
 
-def test_hyper_connections():
+@pytest.mark.parametrize('tanh', (True, False))
+def test_hyper_connections(tanh):
+
     model = TransformerWrapper(
         num_tokens = 20000,
         max_seq_len = 1024,
@@ -599,7 +601,10 @@ def test_hyper_connections():
             dim = 128,
             depth = 6,
             heads = 8,
-            num_residual_streams = 8 # 8 dynamic hyper connection residual streams
+            num_residual_streams = 8, # 8 dynamic hyper connection residual streams
+            residual_fn_kwargs = dict(
+                tanh = tanh
+            )
         )
     )
 
{x_transformers-1.43.0 → x_transformers-1.43.1}/x_transformers/x_transformers.py

@@ -870,6 +870,7 @@ class HyperConnection(Module):
         *,
         layer_index,
         num_residual_streams,
+        tanh = True,
         **kwargs
     ):
         """
@@ -878,6 +879,8 @@ class HyperConnection(Module):
         """
         super().__init__()
 
+        self.act = nn.Tanh() if tanh else nn.Identity()
+
         self.norm = nn.LayerNorm(dim, bias = False)
 
         self.num_residual_streams = num_residual_streams
@@ -901,11 +904,11 @@ class HyperConnection(Module):
 
         normed = self.norm(residuals)
 
-        wc_weight = (normed @ self.dynamic_alpha_fn)
+        wc_weight = self.act(normed @ self.dynamic_alpha_fn)
         dynamic_alpha = wc_weight * self.dynamic_alpha_scale
         alpha = dynamic_alpha + self.static_alpha
 
-        dc_weight = (normed @ self.dynamic_beta_fn)
+        dc_weight = self.act(normed @ self.dynamic_beta_fn)
         dynamic_beta = dc_weight * self.dynamic_beta_scale
         beta = dynamic_beta + self.static_beta
 
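The effect of the new activation is easy to see in isolation. Below is a minimal sketch, not the actual `HyperConnection` internals: the names `dynamic_alpha_fn`, `dynamic_alpha_scale`, and `static_alpha` come from the diff above, but here they are plain tensors with made-up shapes standing in for the module's learned parameters. With `tanh = True` the data-dependent mixing weights are squashed into (-1, 1) before being scaled and added to the static weights, keeping the dynamic part of the hyper connection bounded.

```python
import torch
import torch.nn as nn

# hypothetical sizes, for illustration only
dim, num_streams, seq_len = 128, 8, 16

norm = nn.LayerNorm(dim, bias = False)
dynamic_alpha_fn = torch.randn(dim, num_streams) * 0.02  # stand-in for the learned projection
dynamic_alpha_scale = 0.1
static_alpha = torch.ones(num_streams)

act = nn.Tanh()  # what tanh = True selects; tanh = False would pick nn.Identity()

residuals = torch.randn(seq_len, dim)
normed = norm(residuals)

wc_weight = act(normed @ dynamic_alpha_fn)       # bounded in (-1, 1)
dynamic_alpha = wc_weight * dynamic_alpha_scale  # then scaled
alpha = dynamic_alpha + static_alpha             # added to the static mixing weights
```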
@@ -1650,9 +1653,10 @@ class AttentionLayers(Module):
         unet_skips = False,
         num_residual_streams = 1,
         reinject_input = False, # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1
-        add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1
+        add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1 - further corroboration by https://arxiv.org/abs/2412.15113 (faster emergence of ICL) - looks like this setting may becoming a necessity for every transformer soon
         learned_value_residual_mix = True, # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned
         rel_pos_kwargs: dict = dict(),
+        residual_fn_kwargs: dict = dict(),
         **kwargs
     ):
         super().__init__()
@@ -1957,7 +1961,7 @@ class AttentionLayers(Module):
             else:
                 residual_fn = Residual
 
-            residual = residual_fn(dim, layer_index = ind, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant)
+            residual = residual_fn(dim, layer_index = ind, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant, **residual_fn_kwargs)
 
             # handle unet skip connection
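End to end, `residual_fn_kwargs` is simply forwarded to whichever residual class gets constructed, so `tanh` reaches `HyperConnection` without `AttentionLayers` needing to know about it. A usage sketch mirroring the updated test (the `(1, 256)` input and the `tanh = False` value are arbitrary choices for illustration):

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 128,
        depth = 6,
        heads = 8,
        num_residual_streams = 8,  # 8 dynamic hyper connection residual streams
        residual_fn_kwargs = dict(
            tanh = False           # new in 1.43.1: drop the tanh on the dynamic weights
        )
    )
)

x = torch.randint(0, 20000, (1, 256))
logits = model(x)  # (1, 256, 20000)
```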