x-transformers 1.44.4.tar.gz → 1.44.6.tar.gz

Files changed (22)
  1. {x_transformers-1.44.4/x_transformers.egg-info → x_transformers-1.44.6}/PKG-INFO +11 -2
  2. {x_transformers-1.44.4 → x_transformers-1.44.6}/setup.py +1 -1
  3. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/x_transformers.py +9 -1
  4. {x_transformers-1.44.4 → x_transformers-1.44.6/x_transformers.egg-info}/PKG-INFO +11 -2
  5. {x_transformers-1.44.4 → x_transformers-1.44.6}/LICENSE +0 -0
  6. {x_transformers-1.44.4 → x_transformers-1.44.6}/README.md +0 -0
  7. {x_transformers-1.44.4 → x_transformers-1.44.6}/setup.cfg +0 -0
  8. {x_transformers-1.44.4 → x_transformers-1.44.6}/tests/test_x_transformers.py +0 -0
  9. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/__init__.py +0 -0
  10. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/attend.py +0 -0
  11. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/autoregressive_wrapper.py +0 -0
  12. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/continuous.py +0 -0
  13. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/dpo.py +0 -0
  14. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/multi_input.py +0 -0
  15. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/neo_mlp.py +0 -0
  16. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/nonautoregressive_wrapper.py +0 -0
  17. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  18. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/xval.py +0 -0
  19. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers.egg-info/SOURCES.txt +0 -0
  20. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers.egg-info/dependency_links.txt +0 -0
  21. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers.egg-info/requires.txt +0 -0
  22. {x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers.egg-info/top_level.txt +0 -0
{x_transformers-1.44.4/x_transformers.egg-info → x_transformers-1.44.6}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: x-transformers
- Version: 1.44.4
+ Version: 1.44.6
  Summary: X-Transformers - Pytorch
  Home-page: https://github.com/lucidrains/x-transformers
  Author: Phil Wang
@@ -19,3 +19,12 @@ Requires-Dist: einops>=0.8.0
  Requires-Dist: loguru
  Requires-Dist: packaging>=21.0
  Requires-Dist: torch>=2.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: requires-dist
+ Dynamic: summary
{x_transformers-1.44.4 → x_transformers-1.44.6}/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
  setup(
  name = 'x-transformers',
  packages = find_packages(exclude=['examples']),
- version = '1.44.4',
+ version = '1.44.6',
  license='MIT',
  description = 'X-Transformers - Pytorch',
  author = 'Phil Wang',
{x_transformers-1.44.4 → x_transformers-1.44.6}/x_transformers/x_transformers.py
@@ -1736,6 +1736,7 @@ class AttentionLayers(Module):
  unet_skips = False,
  num_residual_streams = 1,
  reinject_input = False, # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1
+ learned_reinject_input_gate = False,
  add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1 - further corroboration by https://arxiv.org/abs/2412.15113 (faster emergence of ICL) - looks like this setting may becoming a necessity for every transformer soon
  learned_value_residual_mix = True, # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned
  rel_pos_kwargs: dict = dict(),
@@ -1993,6 +1994,7 @@

  self.reinject_input = reinject_input
  self.reinject_input_proj = nn.Linear(dim, dim, bias = False) if reinject_input else None
+ self.learned_reinject_input_gate = nn.Linear(dim, 1, bias = False) if learned_reinject_input_gate else None

  # add the value from the first self attention block to all latter projected self attention values as a residual

@@ -2225,6 +2227,8 @@

  # derived input for reinjection if needed

+ inp_inject = None
+
  if self.reinject_input:
  assert not exists(in_attn_cond)
  inp_inject = self.reinject_input_proj(x)
@@ -2233,6 +2237,10 @@
  # handle in-attention conditioning, which serves the same purpose of having the network learn the residual
  inp_inject = in_attn_cond if in_attn_cond.ndim == 3 else rearrange(in_attn_cond, 'b d -> b 1 d')

+ if exists(inp_inject) and exists(self.learned_reinject_input_gate):
+ inp_inject_gate = self.learned_reinject_input_gate(x).sigmoid()
+ inp_inject = inp_inject * inp_inject_gate
+
  # store all hiddens for skips

  skip_hiddens = []
@@ -2282,7 +2290,7 @@
  post_branch_norm = maybe(partial)(post_branch_norm, **norm_kwargs)
  post_main_norm = maybe(partial)(post_main_norm, **norm_kwargs)

- if self.reinject_input:
+ if exists(inp_inject):
  x = x + inp_inject

  if exists(pre_norm):
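The only functional change in this range is the optional learned gate on input reinjection shown above. Below is a minimal usage sketch: the reinject_input and learned_reinject_input_gate flags come from this diff, while the TransformerWrapper/Decoder setup follows the library's README-style API and the sizes (vocab, dims, depth, sequence length) are illustrative placeholders.

# minimal sketch, assuming x-transformers >= 1.44.6; all sizes are placeholders
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        reinject_input = True,               # re-add a projection of the embedded input at every layer
        learned_reinject_input_gate = True   # flag added in this diff: per-token sigmoid gate on that reinjection
    )
)

tokens = torch.randint(0, 20000, (1, 256))
logits = model(tokens)  # (1, 256, 20000)

With the gate left at its default (False), the reinject_input path behaves as in 1.44.4; with it enabled, each token's reinjected contribution is scaled by sigmoid of a bias-free Linear(dim, 1) applied to the layer input, as the hunks above show.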
{x_transformers-1.44.4 → x_transformers-1.44.6/x_transformers.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: x-transformers
- Version: 1.44.4
+ Version: 1.44.6
  Summary: X-Transformers - Pytorch
  Home-page: https://github.com/lucidrains/x-transformers
  Author: Phil Wang
@@ -19,3 +19,12 @@ Requires-Dist: einops>=0.8.0
  Requires-Dist: loguru
  Requires-Dist: packaging>=21.0
  Requires-Dist: torch>=2.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: requires-dist
+ Dynamic: summary