titans-pytorch 0.2.1__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- titans_pytorch/mac_transformer.py +3 -20
- titans_pytorch/memory_models.py +1 -1
- titans_pytorch/neural_memory.py +56 -90
- {titans_pytorch-0.2.1.dist-info → titans_pytorch-0.2.5.dist-info}/METADATA +1 -1
- titans_pytorch-0.2.5.dist-info/RECORD +9 -0
- titans_pytorch-0.2.1.dist-info/RECORD +0 -9
- {titans_pytorch-0.2.1.dist-info → titans_pytorch-0.2.5.dist-info}/WHEEL +0 -0
- {titans_pytorch-0.2.1.dist-info → titans_pytorch-0.2.5.dist-info}/licenses/LICENSE +0 -0
titans_pytorch/mac_transformer.py CHANGED
@@ -488,7 +488,7 @@ class MemoryAsContextTransformer(Module):
 neural_memory_model: Module | None = None,
 neural_memory_kwargs: dict = dict(),
 neural_memory_layers: tuple[int, ...] | None = None,
-aux_kv_recon_loss_weight =
+aux_kv_recon_loss_weight = 1.,
 use_flex_attn = False,
 sliding_window_attn = False,
 weight_tie_memory_model = False,
@@ -536,11 +536,6 @@ class MemoryAsContextTransformer(Module):
 self.weight_tie_memory_model = weight_tie_memory_model
 self.prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)

-# value residual learning for neural memory
-
-is_first_mem = True
-self.mem_add_value_residual = neural_memory_add_value_residual
-
 # mem, attn, and feedforward layers

 for layer in layers:
@@ -570,12 +565,9 @@ class MemoryAsContextTransformer(Module):
 dim = dim,
 chunk_size = self.neural_memory_segment_len,
 model = maybe_copy(neural_memory_model),
-accept_value_residual = not is_first_mem and neural_memory_add_value_residual,
 **neural_memory_kwargs
 )

-is_first_mem = False
-
 ff = FeedForward(dim = dim, mult = ff_mult)

 self.layers.append(ModuleList([
@@ -765,8 +757,6 @@ class MemoryAsContextTransformer(Module):

 value_residual = None

-mem_value_residual = None
-
 # aux losses

 kv_recon_losses = self.zero
@@ -794,28 +784,21 @@ class MemoryAsContextTransformer(Module):
 mem_input, add_residual = mem_hyper_conn(x)

 if not is_inferencing:
-(retrieved, next_neural_mem_cache
+(retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
 mem_input,
 return_aux_kv_loss = True,
-return_values = True,
-value_residual = mem_value_residual,
 prev_layer_updates = neural_memory_updates
 )

 kv_recon_losses = kv_recon_losses + mem_kv_aux_loss

 else:
-(retrieved, next_neural_mem_cache
+(retrieved, next_neural_mem_cache) = mem.forward_inference(
 mem_input,
 state = next(neural_mem_caches, None),
-return_values = True,
-value_residual = mem_value_residual,
 prev_layer_updates = neural_memory_updates
 )

-if self.mem_add_value_residual:
-mem_value_residual = next_mem_value_residual
-
 if prev_neural_mem_update_for_weights:
 neural_memory_updates = next_neural_mem_cache.updates

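The two call sites above show the slimmed-down return contract in 0.2.5: the training path unpacks `(retrieved, cache), aux_loss` and the inference path unpacks `(retrieved, cache)`, with the value-residual arguments gone. A minimal sketch of the same calling convention on a standalone NeuralMemory; the sizes (dim = 384, chunk_size = 64) and tensor shapes here are hypothetical, only the unpacking pattern is taken from the hunk above.

import torch
from titans_pytorch import NeuralMemory

# hypothetical sizes; only the return/unpack pattern mirrors the diff above
mem = NeuralMemory(dim = 384, chunk_size = 64)

seq = torch.randn(2, 1024, 384)

# training-style call: aux kv-reconstruction loss requested, no value residual passed
(retrieved, next_mem_cache), aux_kv_loss = mem(seq, return_aux_kv_loss = True)

assert retrieved.shape == seq.shape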
titans_pytorch/memory_models.py CHANGED
titans_pytorch/neural_memory.py CHANGED
@@ -67,6 +67,9 @@ def safe_cat(inputs, dim = -2):
 def identity(t):
 return t

+def dict_get_shape(td):
+return {k: v.shape for k, v in td.items()}
+
 def pair(v):
 return (v, v) if not isinstance(v, tuple) else v

@@ -258,7 +261,6 @@ class NeuralMemory(Module):
 pre_rmsnorm = True,
 post_rmsnorm = True,
 qk_rmsnorm = False,
-accept_value_residual = False,
 max_grad_norm: float | None = None,
 use_accelerated_scan = False,
 activation: Module | None = None,
@@ -302,19 +304,34 @@ class NeuralMemory(Module):
 nn.Sigmoid()
 ) if heads > 1 else None

-# memory
+# memory model

 if not exists(model):
 model = MemoryMLP(dim_head, **default_model_kwargs)

+# validate memory model
+
 assert not exists(next(model.buffers(), None)), 'model cannot have buffers for now'

+test_shape = (3, 2, dim_head)
+
+with torch.no_grad():
+try:
+test_input = torch.randn(test_shape)
+mem_model_output = model(test_input)
+except:
+raise RuntimeError(f'memory model unable to accept a tensor of shape {test_shape}')
+
+assert mem_model_output.shape == test_shape, 'output of memory model needs to be same shape as input'
+
 # the memory is the weights of the model

 self.memory_model = model

 self.num_memory_parameter_tensors = len(set(model.parameters()))

+self.init_weight_shape = dict_get_shape(dict(model.named_parameters()))
+
 # the chunk size within the paper where adaptive step, momentum, weight decay are shared

 self.chunk_size = chunk_size
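The constructor now probes the supplied memory model with a small (3, 2, dim_head) tensor and requires the output to keep that shape (and, as before, the model may not register buffers). A custom memory model therefore only needs to be a buffer-free nn.Module that preserves the last dimension; the class below is a hypothetical example (not part of the package) that would pass the check added above.

import torch
from torch import nn

# hypothetical custom memory model: no buffers, shape-preserving over the last dimension
class GatedResidualMemory(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)
        self.gate = nn.Linear(dim, dim)

    def forward(self, x):
        return x + self.proj(x) * self.gate(x).sigmoid()

model = GatedResidualMemory(64)

out = model(torch.randn(3, 2, 64))
assert out.shape == (3, 2, 64)  # satisfies the shape assertion added in the hunk above

Such a module would be handed to NeuralMemory through its `model` argument, which this validation runs against before the model's weights are adopted as the memory.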
@@ -343,19 +360,6 @@ class NeuralMemory(Module):
 self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
 self.store_memory_loss_fn = store_memory_loss_fn

-# value residual learning
-
-self.learned_value_residual = Sequential(
-LinearNoBias(dim, heads),
-Rearrange('b n h -> b h n 1'),
-nn.Sigmoid()
-) if accept_value_residual else None
-
-# empty memory embed
-
-self.empty_memory_embed = nn.Parameter(torch.zeros(dim))
-nn.init.normal_(self.empty_memory_embed, std = 0.02)
-
 # `chunk_size` refers to chunk size used for storing to memory model weights

 chunk_size = self.store_chunk_size
@@ -417,9 +421,6 @@ class NeuralMemory(Module):
 weights = TensorDict(dict(self.memory_model.named_parameters()))
 return weights

-def init_empty_memory_embed(self, batch, seq_len):
-return repeat(self.empty_memory_embed, 'd -> b n d', b = batch, n = seq_len)
-
 def store_memories(
 self,
 seq,
@@ -428,10 +429,7 @@ class NeuralMemory(Module):
 prev_layer_updates: dict[str, Tensor] | None = None,
 return_aux_kv_loss = False,
 chunk_size = None,
-value_residual = None
 ):
-assert xnor(exists(value_residual), exists(self.learned_value_residual))
-
 seq_len, heads, chunk_size = seq.shape[-2], self.heads, default(chunk_size, self.store_chunk_size)

 # handle edge case
@@ -446,7 +444,7 @@ class NeuralMemory(Module):

 round_down_seq_len = round_down_multiple(seq_len, chunk_size)

-seq = seq[:, :round_down_seq_len]
+seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]

 # per sample grad function

@@ -499,14 +497,6 @@ class NeuralMemory(Module):

 keys = self.k_norm(keys)

-# maybe value residual learning
-
-orig_values = values
-
-if exists(self.learned_value_residual):
-mix = self.learned_value_residual(seq)
-values = values.lerp(value_residual, mix)
-
 # take care of chunking

 keys, values = tuple(rearrange(t, 'b h (n c) d -> (b h n) c d', c = chunk_size) for t in (keys, values))
@@ -581,13 +571,15 @@ class NeuralMemory(Module):
 if has_momentum:
 next_momentum[param_name] = inverse_pack(momentum)

-#
+# determine next state for the storing of memories

 next_state = (next_last_update, next_last_momentum)

+next_store_state = NeuralMemCache(seq_len, remainder, next_state, updates)
+
 # returns

-output = (updates,
+output = (updates, next_store_state)

 if not return_aux_kv_loss:
 return output
@@ -606,16 +598,18 @@ class NeuralMemory(Module):

 seq = self.retrieve_norm(seq)

-
-
+assert seq_len >= chunk_size, 'must be handled outside of retrieve'
+
+needs_pad = chunk_size > 1

-
-
+if needs_pad:
+seq = pad_at_dim(seq, (1, 0), dim = 1)
+seq_len_plus_one = seq.shape[-2]

-
+next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)

-
-
+padding = next_seq_len - seq_len_plus_one
+seq = pad_at_dim(seq, (0, padding), dim = 1)

 # the parameters of the memory model stores the memories of the key / values
 # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
@@ -639,7 +633,9 @@ class NeuralMemory(Module):

 # fetch values from memory model

-curr_weights
+if dict_get_shape(curr_weights) != self.init_weight_shape:
+curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+
 queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)

 # forward functional call
@@ -665,10 +661,10 @@ class NeuralMemory(Module):

 # restore, pad with empty memory embed

-
-
+if needs_pad:
+values = values[:, 1:(seq_len + 1)]

-return values
+return values

 @torch.no_grad()
 def forward_inference(
@@ -676,8 +672,6 @@ class NeuralMemory(Module):
 token: Tensor,
 state = None,
 prev_layer_updates: dict[str, Tensor] | None = None,
-return_values = False,
-value_residual = None,
 ):

 # unpack previous state
@@ -704,12 +698,9 @@ class NeuralMemory(Module):
 # early return empty memory, when no memories are stored for steps < first chunk size

 if curr_seq_len < self.chunk_size:
-
-
-output = empty_mem, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)
+retrieve = self.retrieve_memories(token, weights, chunk_size = 1)

-
-output = (*output, self.zero)
+output = retrieve, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)

 return output

@@ -728,20 +719,18 @@ class NeuralMemory(Module):
 prev_layer_updates = TensorDict(prev_layer_updates)
 prev_layer_updates = prev_layer_updates.apply(lambda t: t[:, -1:])

-values = None
-
 if store_seq_cache_len == self.chunk_size:

-next_updates,
+next_updates, store_state = self.store_memories(
 cache_store_seq,
 weights,
 past_state = past_states,
 prev_layer_updates = prev_layer_updates,
-value_residual = value_residual
 )

 updates = next_updates
 cache_store_seq = None
+next_states = store_state.states

 # retrieve

@@ -749,14 +738,9 @@ class NeuralMemory(Module):

 # next state tuple

-
+next_store_state = NeuralMemCache(curr_seq_len, cache_store_seq, next_states, updates)

-
-
-if return_values:
-output = (*output, values)
-
-return output
+return retrieved, next_store_state

 def forward(
 self,
@@ -767,50 +751,45 @@ class NeuralMemory(Module):
 return_aux_kv_loss = False,
 chunk_size = None,
 store_chunk_size = None,
-return_values = False,
-value_residual = None,
 return_next_state = False,
 prev_layer_updates: dict[str, Tensor] | None = None
 ):
 batch, seq_len = seq.shape[:2]

+if not exists(mem_model_weights):
+mem_model_weights = self.init_weights()
+
 if seq_len < self.retrieve_chunk_size:
-
+retrieved = self.retrieve_memories(seq, mem_model_weights, chunk_size = 1)

 next_store_state = NeuralMemCache(seq_len, seq, None, None)

-out = (
-
-if return_values:
-out = (*out, self.zero)
+out = (retrieved, next_store_state)

 if not return_aux_kv_loss:
 return out

 return out, self.zero

-if not exists(mem_model_weights):
-mem_model_weights = self.init_weights()
-
 # store

 store_seq = default(store_seq, seq)

-
-store_chunk_size = default(store_chunk_size, chunk_size, self.store_chunk_size)
-remainder = store_seq_len % store_chunk_size
-
-(updates, next_state, values), aux_kv_recon_loss = self.store_memories(
+(updates, next_store_state), aux_kv_recon_loss = self.store_memories(
 store_seq,
 mem_model_weights,
 chunk_size = store_chunk_size,
 prev_layer_updates = prev_layer_updates,
-value_residual = value_residual,
 return_aux_kv_loss = True
 )

 # retrieve

+if exists(prev_layer_updates):
+prev_layer_updates = prev_layer_updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+
+updates = updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+
 retrieved = self.retrieve_memories(
 seq,
 mem_model_weights + updates,
@@ -818,21 +797,8 @@ class NeuralMemory(Module):
 prev_layer_updates = prev_layer_updates
 )

-# determine state for the storing of memories
-# for transformer-xl like training with neural memory as well as inferencing with initial prompt
-
-cache_store_seq = None
-
-if remainder > 0:
-cache_store_seq = store_seq[:, -remainder:]
-
-next_store_state = NeuralMemCache(seq_len, cache_store_seq, next_state, updates)
-
 output = (retrieved, next_store_state)

-if return_values:
-output = (*output, values)
-
 if not return_aux_kv_loss:
 return output

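With return_values and value_residual removed, forward_inference now takes only the token, the previous cache, and optional previous-layer updates, and returns (retrieved, next_store_state). A minimal step-wise decoding sketch of that loop, again with hypothetical sizes and shapes and assuming a None initial state is accepted as the empty cache.

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 384, chunk_size = 64)  # hypothetical sizes

state = None

for _ in range(8):
    token = torch.randn(2, 1, 384)  # one new token per step, batch of 2
    # cache from the previous step is threaded back in as `state`
    retrieved, state = mem.forward_inference(token, state = state)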
titans_pytorch-0.2.5.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=UOJAMv7nTgkefBB7M7K3U0NnFkz75tFRG5WLXRdfnLw,26039
+titans_pytorch/memory_models.py,sha256=Ew28waD9gf1wn-5Nkdc676u1I92IqzaOAw-tv0JXMwc,3777
+titans_pytorch/neural_memory.py,sha256=YiBsMiqYn-Hva4yhxfaqkGV857vZIASxi5Z0TT0FC10,24606
+titans_pytorch-0.2.5.dist-info/METADATA,sha256=x3RePuTDf3rUT3vtvge1X3Ry18Y3tV_swCgycbtSCjQ,6819
+titans_pytorch-0.2.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.2.5.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.2.5.dist-info/RECORD,,
titans_pytorch-0.2.1.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=kqW90mpbFf1ZJ_mMkd6v9EQ5J__TwKMPy5cjHJF_26A,26742
-titans_pytorch/memory_models.py,sha256=LI9T36XB6YXIvvGWRw0ZMDlGpRC6KIv03OPzME2VAaU,3772
-titans_pytorch/neural_memory.py,sha256=vmKPOAlXBPXBnYPODrg_reWaIcr1xwtfQmuptGS6e5A,25559
-titans_pytorch-0.2.1.dist-info/METADATA,sha256=HPdcQb4SlT-eLFzOYLMwGInEKegL4M4yIpKWt1a6DTs,6819
-titans_pytorch-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.2.1.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.2.1.dist-info/RECORD,,
{titans_pytorch-0.2.1.dist-info → titans_pytorch-0.2.5.dist-info}/WHEEL: File without changes
{titans_pytorch-0.2.1.dist-info → titans_pytorch-0.2.5.dist-info}/licenses/LICENSE: File without changes