titans-pytorch 0.3.25__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,7 +46,7 @@ def create_mac_block_mask(seq_len, window_size, persist_mem_len, sliding = False
 
  # einstein notation related
 
- from einops import repeat, rearrange, pack, unpack
+ from einops import repeat, rearrange, pack, unpack, einsum
  from einops.layers.torch import Rearrange
 
  # b - batch
@@ -521,9 +521,7 @@ class MemoryAsContextTransformer(Module):
  self.sliding_window_attn = sliding_window_attn
  self.attn_window_size = segment_len + num_longterm_mem_tokens
 
- # hyper conection
-
- assert not (num_residual_streams <= 1 and neural_memory_qkv_receives_diff_views), 'allow neural memory queries, keys, values to be derived from different combinations of the residual streams can only work if hyper connections has greater than 1 residual stream'
+ # hyper connection
 
  init_hyper_conn, self.expand_streams, self.reduce_streams = get_init_and_expand_reduce_stream_functions(num_residual_streams, dim = dim, add_stream_embed = True, disable = num_residual_streams == 1)
 
@@ -560,17 +558,28 @@ class MemoryAsContextTransformer(Module):
  )
 
  mem = None
+ mem_qkv_layer_selector = None
  mem_hyper_conn = None
 
  if layer in neural_memory_layers:
- mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output, num_input_views = 3 if neural_memory_qkv_receives_diff_views else 1)
+ mem_hyper_conn = init_hyper_conn(add_branch_out_to_residual = not neural_mem_gate_attn_output)
+
+ if not is_first and neural_memory_qkv_receives_diff_views:
+ num_layer_choices = (layer - 1) * 4 + 1 # for each layer, have memory input select from attn inp, attn out, ff inp, and ff out - plus one for the current point in the residual stream (memory input)
+
+ mem_qkv_layer_selector = nn.Sequential(
+ nn.RMSNorm(dim),
+ nn.Linear(dim, 3 * num_layer_choices),
+ Rearrange('... (views layers) -> views ... layers', views = 3),
+ nn.Softmax(dim = -1)
+ )
 
  mem = NeuralMemory(
  dim = dim,
  chunk_size = self.neural_memory_segment_len,
  batch_size = neural_memory_batch_size,
  model = deepcopy(neural_memory_model),
- qkv_receives_diff_views = neural_memory_qkv_receives_diff_views,
+ qkv_receives_diff_views = True,
  accept_weight_residual = neural_mem_weight_residual and not is_first_neural_mem,
  **neural_memory_kwargs
  )
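The new `mem_qkv_layer_selector` gives each neural memory layer a learned, per-token softmax over earlier hidden states (attention/feedforward inputs and outputs, plus the current point in the residual stream), one distribution each for queries, keys and values. A minimal standalone sketch of what that module computes; `dim` and the number of candidate layers below are illustrative values, not taken from the package:

    # Standalone sketch of the selector added above (illustrative sizes only).
    import torch
    from torch import nn
    from einops.layers.torch import Rearrange

    dim = 512
    num_layer_choices = 1 + 4  # current residual point + attn in/out and ff in/out of one earlier layer

    selector = nn.Sequential(
        nn.RMSNorm(dim),
        nn.Linear(dim, 3 * num_layer_choices),
        Rearrange('... (views layers) -> views ... layers', views = 3),
        nn.Softmax(dim = -1),
    )

    mem_input = torch.randn(2, 16, dim)  # (batch, seq, dim)
    weights = selector(mem_input)        # (3, batch, seq, num_layer_choices), softmax over the layer axis
    assert weights.shape == (3, 2, 16, num_layer_choices)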
@@ -581,9 +590,12 @@ class MemoryAsContextTransformer(Module):
 
  self.layers.append(ModuleList([
  mem_hyper_conn,
+ init_hyper_conn(),
+ init_hyper_conn(),
+ mem_qkv_layer_selector,
  mem,
- init_hyper_conn(branch = attn),
- init_hyper_conn(branch = ff)
+ attn,
+ ff,
  ]))
 
  self.norm = nn.RMSNorm(dim)
@@ -763,6 +775,10 @@ class MemoryAsContextTransformer(Module):
 
  mem_weight_residual = None
 
+ # layers for the neural mem to select the qkv inputs from
+
+ mem_input_layers = []
+
  # when inferencing, only do one token at a time
 
  if is_inferencing:
@@ -773,7 +789,7 @@ class MemoryAsContextTransformer(Module):
 
  x = self.expand_streams(x)
 
- for mem_hyper_conn, mem, attn, ff in self.layers:
+ for mem_hyper_conn, attn_hyper_conn, ff_hyper_conn, mem_qkv_layer_selector, mem, attn, ff in self.layers:
 
  retrieved = None
  attn_out_gates = None
@@ -785,8 +801,19 @@ class MemoryAsContextTransformer(Module):
 
  mem_input, add_residual = mem_hyper_conn(x)
 
+ if not exists(mem_qkv_layer_selector):
+ qkv_mem_input = stack((mem_input, mem_input, mem_input))
+ else:
+ layers_to_choose_from = stack((mem_input, *mem_input_layers))
+
+ # let the current `mem_input` select the 3 layers for qkv
+
+ selected = mem_qkv_layer_selector(mem_input)
+
+ qkv_mem_input = einsum(layers_to_choose_from, selected, 'l b n d, v b n l -> v b n d')
+
  retrieved, next_neural_mem_cache = mem.forward(
- mem_input,
+ qkv_mem_input,
  state = next(neural_mem_caches, None),
  prev_weights = mem_weight_residual
  )
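The `einsum` line above blends the stacked candidate hidden states with those softmax weights, producing one mixed input each for queries, keys and values. An illustrative shape check with toy tensors (not the transformer's actual state):

    # Toy shape check for the qkv mixing einsum; l = candidate layers,
    # v = 3 views (q, k, v), b/n/d = batch/sequence/feature dims (made-up sizes).
    import torch
    from einops import einsum

    l, v, b, n, d = 5, 3, 2, 16, 512
    layers_to_choose_from = torch.randn(l, b, n, d)
    selected = torch.randn(v, b, n, l).softmax(dim = -1)

    qkv_mem_input = einsum(layers_to_choose_from, selected, 'l b n d, v b n l -> v b n d')
    assert qkv_mem_input.shape == (v, b, n, d)  # one weighted-average input per view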
@@ -801,8 +828,12 @@ class MemoryAsContextTransformer(Module):
 
  # attention
 
- x, (values, next_kv_cache) = attn(
- x,
+ attn_in, add_residual = attn_hyper_conn(x)
+
+ mem_input_layers.append(attn_in)
+
+ attn_out, (values, next_kv_cache) = attn(
+ attn_in,
  value_residual = value_residual,
  disable_flex_attn = disable_flex_attn,
  flex_attn_fn = flex_attn_fn,
@@ -810,8 +841,12 @@ class MemoryAsContextTransformer(Module):
  cache = next(kv_caches, None)
  )
 
+ mem_input_layers.append(attn_out)
+
  value_residual = default(value_residual, values)
 
+ x = add_residual(attn_out)
+
  # caches
 
  next_kv_caches.append(next_kv_cache)
@@ -819,7 +854,15 @@ class MemoryAsContextTransformer(Module):
 
  # feedforward
 
- x = ff(x)
+ ff_in, add_ff_residual = ff_hyper_conn(x)
+
+ mem_input_layers.append(ff_in)
+
+ ff_out = ff(ff_in)
+
+ mem_input_layers.append(ff_out)
+
+ x = add_ff_residual(ff_out)
 
  # taking care of cache first
  # for early return when processing long term mem tokens during inference
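Attention and feedforward now follow the same explicit branch-in / branch-out convention as the memory path: the hyper connection returns the branch input plus a closure that folds the branch output back into the residual streams, and both tensors are appended to `mem_input_layers` so later memory layers can select over them. A toy stand-in for that calling convention (the real modules come from the hyper-connections dependency; `toy_hyper_conn` below is hypothetical):

    # Hypothetical stand-in illustrating the branch-in / branch-out pattern used above.
    import torch
    from torch import nn

    def toy_hyper_conn(x):
        # hand back the branch input and a closure that adds the branch output to the residual
        def add_residual(branch_out):
            return x + branch_out
        return x, add_residual

    ff = nn.Sequential(nn.Linear(8, 32), nn.GELU(), nn.Linear(32, 8))

    x = torch.randn(2, 16, 8)
    mem_input_layers = []

    ff_in, add_ff_residual = toy_hyper_conn(x)
    mem_input_layers.append(ff_in)   # pre-branch state, later selectable by neural memory
    ff_out = ff(ff_in)
    mem_input_layers.append(ff_out)  # post-branch state, later selectable by neural memory
    x = add_ff_residual(ff_out)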
@@ -353,11 +353,11 @@ class NeuralMemory(Module):
  pred = functional_call(self.memory_model, params, inputs)
  loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
  weighted_loss = loss * loss_weights
- return weighted_loss.sum()
+ return weighted_loss.sum(), loss
 
  # two functions
 
- grad_fn = grad(forward_and_loss)
+ grad_fn = grad(forward_and_loss, has_aux = True)
 
  self.per_sample_grad_fn = vmap(grad_fn, in_dims = (0, 0, 0, 0))
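`has_aux = True` tells `torch.func.grad` that the wrapped function returns `(value_to_differentiate, aux)`, so the gradient call now hands back `(grads, aux)` and the unweighted per-element loss (the surprise) is surfaced without a second forward pass. A minimal standalone illustration, not the NeuralMemory internals:

    # Minimal torch.func.grad example with has_aux = True (toy shapes).
    import torch
    from torch.func import grad

    def forward_and_loss(w, x, y):
        pred = x @ w
        per_elem = (pred - y).pow(2)
        return per_elem.sum(), per_elem  # scalar to differentiate, auxiliary per-element loss

    grad_fn = grad(forward_and_loss, has_aux = True)

    w = torch.randn(4, 1)
    x = torch.randn(8, 4)
    y = torch.randn(8, 1)

    g, per_elem = grad_fn(w, x, y)  # grads w.r.t. w, plus the aux output untouched
    assert g.shape == w.shape and per_elem.shape == (8, 1)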
@@ -526,6 +526,7 @@ class NeuralMemory(Module):
  seq_index = 0,
  prev_weights = None,
  mask: Tensor | None = None,
+ return_surprises = True
  ):
  if self.qkv_receives_diff_views:
  _, batch, seq_len = seq.shape[:3]
@@ -645,10 +646,14 @@ class NeuralMemory(Module):
 
  # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)
 
- grads = self.per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)
+ grads, unweighted_mem_model_loss = self.per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)
 
  grads = TensorDict(grads)
 
+ # surprises
+
+ unweighted_mem_model_loss = rearrange(unweighted_mem_model_loss, '(b h n) c -> b h (n c)', b = batch, h = heads)
+
  # maybe softclamp grad norm
 
  if exists(self.max_grad_norm):
@@ -687,7 +692,10 @@ class NeuralMemory(Module):
 
  output = (updates, next_store_state)
 
- return output
+ if not return_surprises:
+ return output
+
+ return (*output, unweighted_mem_model_loss)
 
  # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
 
@@ -744,7 +752,10 @@ class NeuralMemory(Module):
 
  # return updates to neural memory at all chunked timesteps + neural mem cache / state to be fed back
 
- return updates, next_store_state
+ if not return_surprises:
+ return updates, next_store_state
+
+ return updates, next_store_state, unweighted_mem_model_loss
 
  def retrieve_memories(
  self,
@@ -843,7 +854,8 @@ class NeuralMemory(Module):
  store_seq = None,
  state: NeuralMemState | None = None,
  prev_weights = None,
- store_mask: Tensor | None = None
+ store_mask: Tensor | None = None,
+ return_surprises = False
  ):
  is_multi_input = self.qkv_receives_diff_views
 
@@ -927,6 +939,7 @@ class NeuralMemory(Module):
 
  # whether to allow network to slowly adjust from initial weight throughout (residual path) to fully updating weights every batch
 
+ surprises = None
  gate = None
 
  if exists(self.transition_gate):
@@ -937,13 +950,14 @@ class NeuralMemory(Module):
 
  # store
 
- next_updates, next_neural_mem_state = self.store_memories(
+ next_updates, next_neural_mem_state, chunk_surprises = self.store_memories(
  store_seq_chunk,
  weights,
  seq_index = seq_index,
  past_state = past_state,
  prev_weights = prev_weights,
- mask = maybe_store_mask
+ mask = maybe_store_mask,
+ return_surprises = True
  )
 
  weights = next_neural_mem_state.weights
@@ -952,6 +966,8 @@ class NeuralMemory(Module):
 
  updates = accum_updates(updates, next_updates)
 
+ surprises = safe_cat((surprises, chunk_surprises), dim = -1)
+
  if is_last and not update_after_final_store:
  continue
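`safe_cat` is the library's helper for accumulating the per-chunk surprises; judging from the `surprises = None` initialization above, it presumably concatenates along the given dim while ignoring `None` entries. A sketch of that assumed behavior, not copied from the package:

    # Assumed behavior of safe_cat: concatenate, dropping None entries
    # so the first chunk passes straight through.
    import torch

    def safe_cat(tensors, dim = -1):
        tensors = [t for t in tensors if t is not None]
        if not tensors:
            return None
        if len(tensors) == 1:
            return tensors[0]
        return torch.cat(tensors, dim = dim)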
@@ -986,4 +1002,9 @@ class NeuralMemory(Module):
  updates
  )
 
- return retrieved, next_neural_mem_state
+ # returning
+
+ if not return_surprises:
+ return retrieved, next_neural_mem_state
+
+ return retrieved, next_neural_mem_state, surprises
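Net effect on the public API: `NeuralMemory.forward` still returns `(retrieved, state)` by default, and returns a third surprises tensor when `return_surprises = True`. A hedged usage sketch; the constructor arguments follow the project README, and all sizes are illustrative:

    # Usage sketch for the new flag; dim/chunk_size/shapes are illustrative.
    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(dim = 384, chunk_size = 64)
    seq = torch.randn(2, 1024, 384)

    retrieved, state = mem(seq)                                      # unchanged default return
    retrieved, state, surprises = mem(seq, return_surprises = True)  # plus unweighted memory-model loss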
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.3.25
+ Version: 0.4.1
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -0,0 +1,9 @@
+ titans_pytorch/__init__.py,sha256=AyEUlcXWpnqrvyeihRAXWIfQlzLA4NhBjOqQU4edL-4,297
+ titans_pytorch/associative_scan.py,sha256=esaLbukFlgvy2aqopsqBy6KEcZ64B3rsNhG8moKdPSc,5159
+ titans_pytorch/mac_transformer.py,sha256=tz72141G5t3AOnxSVsOLtLptGtl8T7zROUvaTw2_XCY,26960
+ titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
+ titans_pytorch/neural_memory.py,sha256=io5fvLWpOTzx8mkDA9sg3Mkc7-aeugUJoDCniryiuYE,32666
+ titans_pytorch-0.4.1.dist-info/METADATA,sha256=XwduHOXOJvjaWJhdYUq-1jhVq2zNKJBwMH1VWopxv5Y,6816
+ titans_pytorch-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ titans_pytorch-0.4.1.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ titans_pytorch-0.4.1.dist-info/RECORD,,
@@ -1,9 +0,0 @@
- titans_pytorch/__init__.py,sha256=AyEUlcXWpnqrvyeihRAXWIfQlzLA4NhBjOqQU4edL-4,297
- titans_pytorch/associative_scan.py,sha256=esaLbukFlgvy2aqopsqBy6KEcZ64B3rsNhG8moKdPSc,5159
- titans_pytorch/mac_transformer.py,sha256=grD327B3OCIy7d23jNUWIoUo1bIgXUqD26dXWCjdi28,25565
- titans_pytorch/memory_models.py,sha256=wnH9i9kUSoVZhEWUlj8LpBSbB400L9kLt1zP8CO45QQ,5835
- titans_pytorch/neural_memory.py,sha256=uh5NbtAAzfPeZPFe7uhgnpUF6qyP0zjP0eXPIgY5pfc,31929
- titans_pytorch-0.3.25.dist-info/METADATA,sha256=SZwazbaNFe1GstoF45zI_aNMpzgXAqv4mOh78gMN5-U,6817
- titans_pytorch-0.3.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- titans_pytorch-0.3.25.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- titans_pytorch-0.3.25.dist-info/RECORD,,