x-transformers 2.3.5-py3-none-any.whl → 2.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -220,8 +220,6 @@ class ContinuousAutoregressiveWrapper(Module):
     def __init__(
         self,
         net: ContinuousTransformerWrapper,
-        ignore_index = -100,
-        pad_value = 0,
         loss_fn: Module | None = None,
         equal_loss_weight_batch = False # setting this to True, if the mask is passed in and sequences are variable in length, each sequence will be weighted the same (as opposed to each token)
     ):
@@ -1207,7 +1207,9 @@ class FeedForward(Module):
         dropout = 0.,
         sublayer_dropout = 0.,
         no_bias = False,
-        zero_init_output = False
+        zero_init_output = False,
+        deep_embed_hiddens = False,
+        deep_embed_num_tokens = None,
     ):
         super().__init__()
         inner_dim = int(dim * mult)
@@ -1223,27 +1225,51 @@ class FeedForward(Module):
         activation = nn.GELU()
 
         if glu:
-            project_in = GLU(dim, inner_dim, activation, mult_bias = glu_mult_bias)
+            proj_in = GLU(dim, inner_dim, activation, mult_bias = glu_mult_bias)
         else:
-            project_in = nn.Sequential(
+            proj_in = nn.Sequential(
                 nn.Linear(dim, inner_dim, bias = not no_bias),
                 activation
             )
 
+        proj_out = nn.Linear(inner_dim, dim_out, bias = not no_bias)
+
         self.ff = Sequential(
-            project_in,
+            proj_in,
             LayerNorm(inner_dim) if post_act_ln else None,
             nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out, bias = not no_bias),
+            proj_out,
             nn.Dropout(sublayer_dropout) if sublayer_dropout > 0. else None
         )
 
+        # deep embed
+
+        # credit goes to Braden Koszarsky for first devising value embeddings in nanogpt-speedrun project
+        # then Bo Peng for coming up with this alternate design in feedforward for RWKV 8
+        # improvements were clearest to me (on my toy setup) with multiplying on output of feedforward, will try with attention at future date
+
+        self.deep_embed = None
+        if deep_embed_hiddens:
+            assert exists(deep_embed_num_tokens)
+            self.deep_embed = nn.Parameter(torch.zeros(deep_embed_num_tokens, dim_out))
+
         # init last linear layer to 0
+
         if zero_init_output:
-            init_zero_(self.ff[-1])
+            init_zero_(proj_out)
 
-    def forward(self, x):
-        return self.ff(x)
+    def forward(
+        self,
+        x,
+        deep_embed_ids = None
+    ):
+        out = self.ff(x)
+
+        if exists(deep_embed_ids) and exists(self.deep_embed):
+            deep_embed = self.deep_embed[deep_embed_ids] + 1.
+            out = out * deep_embed
+
+        return out
 
 # attention. it is all we need
 
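What this hunk adds, in isolation: `FeedForward` gains an optional per-token parameter table (`deep_embed`), initialized to zeros, looked up by token id in `forward`, shifted by `+ 1.` so the gate starts as an identity, and multiplied onto the feedforward output. A minimal standalone sketch of that idea follows; the module name and dimensions are illustrative, not the library's code.

```python
import torch
from torch import nn

# minimal sketch of the deep-embed gate added to FeedForward above
# (illustrative module, not the x-transformers implementation)
class DeepEmbedFF(nn.Module):
    def __init__(self, dim, num_tokens, mult = 4):
        super().__init__()
        self.ff = nn.Sequential(
            nn.Linear(dim, dim * mult),
            nn.GELU(),
            nn.Linear(dim * mult, dim)
        )
        # zeros at init -> gate of exactly 1. -> behaves like a plain feedforward until trained
        self.deep_embed = nn.Parameter(torch.zeros(num_tokens, dim))

    def forward(self, x, token_ids = None):
        out = self.ff(x)
        if token_ids is not None:
            out = out * (self.deep_embed[token_ids] + 1.)  # per-token multiplicative gate on the ffn output
        return out

ff = DeepEmbedFF(dim = 64, num_tokens = 256)
x = torch.randn(2, 10, 64)
ids = torch.randint(0, 256, (2, 10))
assert torch.allclose(ff(x), ff(x, ids))  # identity at initialization, since the table is all zeros
```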
@@ -2354,6 +2380,7 @@ class AttentionLayers(Module):
         pos = None,
         context_pos = None,
         attn_bias = None,
+        deep_embed_ids = None,
         condition = None,
         in_attn_cond = None, # https://arxiv.org/abs/2105.04090
         layers_execute_order: tuple[int, ...] | None = None
@@ -2448,6 +2475,9 @@ class AttentionLayers(Module):
             if cache_age > 0:
                 x = x[:, -cache_age:] # for spec decoding, may be greater than 1
 
+            if exists(deep_embed_ids):
+                deep_embed_ids = deep_embed_ids[:, -cache_age:]
+
             attn_cache = cache.attn_intermediates
 
         iter_attn_cache = iter(attn_cache)
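Context for the hunk above: when a decoding cache is present, `x` has already been trimmed to the newest `cache_age` positions, so `deep_embed_ids` must be trimmed identically or the per-token gate lookups in the feedforward layers would drift out of alignment. A toy illustration (all shapes made up):

```python
import torch

# toy illustration of the cache-age slicing above (shapes are made up)
token_ids = torch.arange(12).unsqueeze(0)   # (1, 12) full sequence of token ids
x = torch.randn(1, 12, 8)                   # hidden states for the same 12 positions

cache_age = 3                               # three positions newer than the cache
x = x[:, -cache_age:]                       # (1, 3, 8)
token_ids = token_ids[:, -cache_age:]       # (1, 3) -- stays aligned with the trimmed x

assert x.shape[1] == token_ids.shape[1]
```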
@@ -2572,7 +2602,7 @@ class AttentionLayers(Module):
             elif layer_type == 'c':
                 out, inter = block(x, context = context, mask = mask, context_mask = context_mask, prev_attn = prev_cross_attn, cache = next(iter_attn_cache, None), value_residual = maybe_cross_attn_value_residual, **cross_attn_rotary_pos_emb, return_intermediates = True)
             elif layer_type == 'f':
-                out = block(x)
+                out = block(x, deep_embed_ids = deep_embed_ids)
 
             # store first self or cross attention intermediate for value residual
 
@@ -2959,7 +2989,7 @@ class TransformerWrapper(Module):
 
         # shapes and variables
 
-        b, n, device, num_mems, has_memory_tokens, emb_frac_gradient, orig_mask = x.shape[0], x.shape[1], x.device, self.num_memory_tokens, self.num_memory_tokens > 0, self.emb_frac_gradient, mask
+        b, n, device, token_ids, num_mems, has_memory_tokens, emb_frac_gradient, orig_mask = x.shape[0], x.shape[1], x.device, x, self.num_memory_tokens, self.num_memory_tokens > 0, self.emb_frac_gradient, mask
 
         return_hiddens = return_mems | return_attn | return_intermediates | return_attn_z_loss | return_embeddings_and_intermediates
         return_embeddings = return_embeddings | (not exists(self.to_logits)) | return_embeddings_and_intermediates
@@ -3066,7 +3096,7 @@ class TransformerWrapper(Module):
 
             # regular
 
-            attended, intermediates = self.attn_layers(x, mask = mask, mems = mems, mem_masks = mem_masks, cache = cache, return_hiddens = True, seq_start_pos = seq_start_pos, **kwargs)
+            attended, intermediates = self.attn_layers(x, mask = mask, mems = mems, mem_masks = mem_masks, cache = cache, deep_embed_ids = token_ids, return_hiddens = True, seq_start_pos = seq_start_pos, **kwargs)
 
         else:
             # recycling
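Taken together, the two `TransformerWrapper` hunks keep a handle on the raw token ids (`token_ids = x`, captured before `x` is replaced by embeddings) and forward them to `attn_layers` as `deep_embed_ids` on every call. Enabling the gate itself presumably goes through the `ff_`-prefixed kwargs that x-transformers routes into `FeedForward`; the kwarg names in the sketch below (`ff_deep_embed_hiddens`, `ff_deep_embed_num_tokens`) are an assumption based on that convention and are not confirmed by this diff.

```python
import torch
from x_transformers import TransformerWrapper, Decoder

# hedged usage sketch -- the ff_* kwarg names are assumed, based on the usual
# ff_-prefix routing into FeedForward, and are not confirmed by this diff
model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        ff_deep_embed_hiddens = True,      # assumed: maps to FeedForward(deep_embed_hiddens = True)
        ff_deep_embed_num_tokens = 20000   # assumed: sizes the per-token gate table
    )
)

ids = torch.randint(0, 20000, (1, 128))
logits = model(ids)  # token ids also reach the feedforward deep-embed gates internally
```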
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.3.5
+Version: 2.3.6
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2475,4 +2475,15 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{Jordan2024,
+    author = {Keller Jordan and Braden Koszarsky},
+    title = {modded-nanogpt (value embeddings from nanogpt speedrun)},
+    year = {2024},
+    publisher = {GitHub},
+    journal = {GitHub repository},
+    howpublished = {https://github.com/KellerJordan/modded-nanogpt},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
@@ -2,16 +2,16 @@ x_transformers/__init__.py,sha256=h3I2ejobgEdy8H7NgV-rP8UaBCnd16-MysvDXH9GMEA,98
 x_transformers/attend.py,sha256=-5BWWhFsp7tvZTdN91Ay5SqOjyj9uOs-122vFvoO6b4,17253
 x_transformers/autoregressive_wrapper.py,sha256=reLCno9Z9pchVU79tBF8OMo21LwSZ67KAeB83jqkyAc,10505
 x_transformers/belief_state_wrapper.py,sha256=YLUMk6t2MhFBEw5lHDDHJHcoCxTIkHvxTNY__GGZEKU,13374
-x_transformers/continuous.py,sha256=bTxwCt_8RlT1-aR2F4R8YOhpjMF-TbpElRbbRiNd6M8,9512
+x_transformers/continuous.py,sha256=DWYD7wwVp0UU5UswK_6CKA_Cmpbl7XfzR9IKMxtECLM,9460
 x_transformers/dpo.py,sha256=xt4OuOWhU8pN3OKN2LZAaC2NC8iiEnchqqcrPWVqf0o,3521
 x_transformers/entropy_based_tokenizer.py,sha256=F2lO8-v3aLIcVDVNhu7RR-UtRdlmaaYJzBK9m7OnLE8,5018
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/neo_mlp.py,sha256=XCNnnop9WLarcxap1kGuYc1x8GHvwkZiDRnXOxSl3Po,3452
 x_transformers/nonautoregressive_wrapper.py,sha256=2NU58hYMgn-4Jzg3mie-mXb0XH_dCN7fjlzd3K1rLUY,10510
-x_transformers/x_transformers.py,sha256=MF91aJGr2DOjIGe57uqwgyNxCExBg_tI9z7usAJMxOM,112401
+x_transformers/x_transformers.py,sha256=kZKk80hxV0Pvmx1E745BR7c8YzB-S4u2cZHSMZvpZq8,113507
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=7S00kCuab4tWQa-vf-z-XfzADjVj48MoFIr7VSIvttg,8575
-x_transformers-2.3.5.dist-info/METADATA,sha256=wPHqpSgc75F3npfdSNCzro1F6PBlVXabA0oarpvZMHI,88686
-x_transformers-2.3.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-x_transformers-2.3.5.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-2.3.5.dist-info/RECORD,,
+x_transformers-2.3.6.dist-info/METADATA,sha256=Z337g7NRRYaKGbBHkKe1UZbIQJeXPk-dtZ4aBiVvSH8,89021
+x_transformers-2.3.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.3.6.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.3.6.dist-info/RECORD,,