titans-pytorch 0.2.0__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/PKG-INFO +1 -1
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/pyproject.toml +1 -1
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/titans_pytorch/mac_transformer.py +7 -22
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/titans_pytorch/neural_memory.py +42 -89
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/train_mac.py +1 -3
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/.gitignore +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/LICENSE +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/README.md +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/data/README.md +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/data/enwik8.gz +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/fig1.png +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/fig2.png +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/tests/test_titans.py +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.2.0 → titans_pytorch-0.2.4}/titans_pytorch/memory_models.py +0 -0
titans_pytorch/mac_transformer.py

@@ -491,7 +491,8 @@ class MemoryAsContextTransformer(Module):
         aux_kv_recon_loss_weight = 0.,
         use_flex_attn = False,
         sliding_window_attn = False,
-        weight_tie_memory_model = False
+        weight_tie_memory_model = False,
+        prev_neural_mem_update_for_weights = None
     ):
         super().__init__()

@@ -533,11 +534,7 @@ class MemoryAsContextTransformer(Module):
         assert exists(neural_memory_model), '`neural_memory_model` must be explicitly set'

         self.weight_tie_memory_model = weight_tie_memory_model
-
-        # value residual learning for neural memory
-
-        is_first_mem = True
-        self.mem_add_value_residual = neural_memory_add_value_residual
+        self.prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)

         # mem, attn, and feedforward layers

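The new `prev_neural_mem_update_for_weights` flag falls back to `weight_tie_memory_model` through `default(...)`. A minimal sketch of that defaulting behaviour; `exists` and `default` are re-implemented here for illustration only, the real helpers live inside the package:

```python
def exists(v):
    return v is not None

def default(*values):
    # return the first argument that exists, mirroring the helper used in titans_pytorch
    return next((v for v in values if exists(v)), None)

weight_tie_memory_model = True
prev_neural_mem_update_for_weights = None  # left unset by the caller

# as in the __init__ hunk above: unset means "follow weight_tie_memory_model"
prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)
assert prev_neural_mem_update_for_weights is True
```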
@@ -568,12 +565,9 @@ class MemoryAsContextTransformer(Module):
                     dim = dim,
                     chunk_size = self.neural_memory_segment_len,
                     model = maybe_copy(neural_memory_model),
-                    accept_value_residual = not is_first_mem and neural_memory_add_value_residual,
                     **neural_memory_kwargs
                 )

-            is_first_mem = False
-
             ff = FeedForward(dim = dim, mult = ff_mult)

             self.layers.append(ModuleList([

@@ -702,7 +696,7 @@ class MemoryAsContextTransformer(Module):

         # math

-        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size,
+        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, prev_neural_mem_update_for_weights = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.prev_neural_mem_update_for_weights

         seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)

@@ -763,8 +757,6 @@ class MemoryAsContextTransformer(Module):

         value_residual = None

-        mem_value_residual = None
-
         # aux losses

         kv_recon_losses = self.zero

@@ -792,29 +784,22 @@ class MemoryAsContextTransformer(Module):
             mem_input, add_residual = mem_hyper_conn(x)

             if not is_inferencing:
-                (retrieved, next_neural_mem_cache
+                (retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
                     mem_input,
                     return_aux_kv_loss = True,
-                    return_values = True,
-                    value_residual = mem_value_residual,
                     prev_layer_updates = neural_memory_updates
                 )

                 kv_recon_losses = kv_recon_losses + mem_kv_aux_loss

             else:
-                (retrieved, next_neural_mem_cache
+                (retrieved, next_neural_mem_cache) = mem.forward_inference(
                     mem_input,
                     state = next(neural_mem_caches, None),
-                    return_values = True,
-                    value_residual = mem_value_residual,
                     prev_layer_updates = neural_memory_updates
                 )

-            if
-                mem_value_residual = next_mem_value_residual
-
-            if weight_tie_memory_model:
+            if prev_neural_mem_update_for_weights:
                 neural_memory_updates = next_neural_mem_cache.updates

             if self.gate_attn_output:
titans_pytorch/neural_memory.py

@@ -67,6 +67,9 @@ def safe_cat(inputs, dim = -2):
 def identity(t):
     return t

+def dict_get_shape(td):
+    return {k: v.shape for k, v in td.items()}
+
 def pair(v):
     return (v, v) if not isinstance(v, tuple) else v

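The new `dict_get_shape` helper simply maps a dict of tensors to their shapes; a quick standalone illustration (the parameter names here are made up):

```python
import torch

def dict_get_shape(td):
    return {k: v.shape for k, v in td.items()}

# hypothetical parameter dict, mirroring how the diff applies it to
# dict(model.named_parameters())
params = {
    'weights.0': torch.randn(64, 64),
    'weights.1': torch.randn(64, 64),
}

print(dict_get_shape(params))
# {'weights.0': torch.Size([64, 64]), 'weights.1': torch.Size([64, 64])}
```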
@@ -258,7 +261,6 @@ class NeuralMemory(Module):
         pre_rmsnorm = True,
         post_rmsnorm = True,
         qk_rmsnorm = False,
-        accept_value_residual = False,
         max_grad_norm: float | None = None,
         use_accelerated_scan = False,
         activation: Module | None = None,

@@ -315,6 +317,8 @@ class NeuralMemory(Module):

         self.num_memory_parameter_tensors = len(set(model.parameters()))

+        self.init_weight_shape = dict_get_shape(dict(model.named_parameters()))
+
         # the chunk size within the paper where adaptive step, momentum, weight decay are shared

         self.chunk_size = chunk_size

@@ -343,19 +347,6 @@ class NeuralMemory(Module):
         self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
         self.store_memory_loss_fn = store_memory_loss_fn

-        # value residual learning
-
-        self.learned_value_residual = Sequential(
-            LinearNoBias(dim, heads),
-            Rearrange('b n h -> b h n 1'),
-            nn.Sigmoid()
-        ) if accept_value_residual else None
-
-        # empty memory embed
-
-        self.empty_memory_embed = nn.Parameter(torch.zeros(dim))
-        nn.init.normal_(self.empty_memory_embed, std = 0.02)
-
         # `chunk_size` refers to chunk size used for storing to memory model weights

         chunk_size = self.store_chunk_size
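With `accept_value_residual`, the learned value-residual gate and the empty-memory embedding removed, a standalone `NeuralMemory` only needs the keyword arguments the `MemoryAsContextTransformer` hunk above is shown passing. A hedged construction sketch; the concrete values are assumptions, and the dimensions are kept equal here purely so the example stands alone (`train_mac.py` pairs a wider transformer with a `dim = 64` memory MLP):

```python
from titans_pytorch import NeuralMemory, MemoryMLP

# sketch only: `dim`, `chunk_size` and `model` are the kwargs visible in this diff;
# the values are assumed
mem = NeuralMemory(
    dim = 64,
    chunk_size = 16,
    model = MemoryMLP(dim = 64, depth = 2),
)
# 0.2.0 additionally accepted `accept_value_residual = ...`; that kwarg is gone in 0.2.4
```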
@@ -417,9 +408,6 @@ class NeuralMemory(Module):
         weights = TensorDict(dict(self.memory_model.named_parameters()))
         return weights

-    def init_empty_memory_embed(self, batch, seq_len):
-        return repeat(self.empty_memory_embed, 'd -> b n d', b = batch, n = seq_len)
-
     def store_memories(
         self,
         seq,

@@ -428,10 +416,7 @@ class NeuralMemory(Module):
         prev_layer_updates: dict[str, Tensor] | None = None,
         return_aux_kv_loss = False,
         chunk_size = None,
-        value_residual = None
     ):
-        assert xnor(exists(value_residual), exists(self.learned_value_residual))
-
         seq_len, heads, chunk_size = seq.shape[-2], self.heads, default(chunk_size, self.store_chunk_size)

         # handle edge case

@@ -446,7 +431,7 @@ class NeuralMemory(Module):

         round_down_seq_len = round_down_multiple(seq_len, chunk_size)

-        seq = seq[:, :round_down_seq_len]
+        seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]

         # per sample grad function

@@ -499,14 +484,6 @@ class NeuralMemory(Module):

         keys = self.k_norm(keys)

-        # maybe value residual learning
-
-        orig_values = values
-
-        if exists(self.learned_value_residual):
-            mix = self.learned_value_residual(seq)
-            values = values.lerp(value_residual, mix)
-
         # take care of chunking

         keys, values = tuple(rearrange(t, 'b h (n c) d -> (b h n) c d', c = chunk_size) for t in (keys, values))

@@ -581,13 +558,15 @@ class NeuralMemory(Module):
             if has_momentum:
                 next_momentum[param_name] = inverse_pack(momentum)

-        #
+        # determine next state for the storing of memories

         next_state = (next_last_update, next_last_momentum)

+        next_store_state = NeuralMemCache(seq_len, remainder, next_state, updates)
+
         # returns

-        output = (updates,
+        output = (updates, next_store_state)

         if not return_aux_kv_loss:
             return output
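`store_memories` now returns a `NeuralMemCache` alongside the raw updates. A hedged reconstruction of what that cache looks like, inferred only from the constructor calls and the `.states` / `.updates` accesses visible in this diff; the names of the first two fields are guesses:

```python
from collections import namedtuple

# Hypothetical reconstruction for illustration -- the real definition lives in
# titans_pytorch and is not shown in this diff. Field order follows the constructor
# call NeuralMemCache(seq_len, remainder, next_state, updates) seen above.
NeuralMemCache = namedtuple('NeuralMemCache', [
    'seq_index',        # guessed name: how many tokens have been stored so far
    'cache_store_seq',  # guessed name: leftover tokens shorter than one chunk
    'states',           # confirmed by `.states`: the (last_update, last_momentum) pair
    'updates',          # confirmed by `.updates`: per-parameter weight updates
])

cache = NeuralMemCache(0, None, None, None)
print(cache.states, cache.updates)
```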
@@ -606,16 +585,18 @@ class NeuralMemory(Module):

         seq = self.retrieve_norm(seq)

-
-
+        assert seq_len >= chunk_size, 'must be handled outside of retrieve'
+
+        needs_pad = chunk_size > 1

-
-
+        if needs_pad:
+            seq = pad_at_dim(seq, (1, 0), dim = 1)
+        seq_len_plus_one = seq.shape[-2]

-
+        next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)

-
-
+        padding = next_seq_len - seq_len_plus_one
+        seq = pad_at_dim(seq, (0, padding), dim = 1)

         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper

@@ -639,7 +620,9 @@ class NeuralMemory(Module):

         # fetch values from memory model

-        curr_weights
+        if dict_get_shape(curr_weights) != self.init_weight_shape:
+            curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+
         queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)

         # forward functional call

@@ -665,10 +648,10 @@ class NeuralMemory(Module):

         # restore, pad with empty memory embed

-
-
+        if needs_pad:
+            values = values[:, 1:(seq_len + 1)]

-        return values
+        return values

     @torch.no_grad()
     def forward_inference(
@@ -676,8 +659,6 @@ class NeuralMemory(Module):
         token: Tensor,
         state = None,
         prev_layer_updates: dict[str, Tensor] | None = None,
-        return_values = False,
-        value_residual = None,
     ):

         # unpack previous state

@@ -704,12 +685,9 @@ class NeuralMemory(Module):
         # early return empty memory, when no memories are stored for steps < first chunk size

         if curr_seq_len < self.chunk_size:
-
-
-            output = empty_mem, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)
+            retrieve = self.retrieve_memories(token, weights, chunk_size = 1)

-
-            output = (*output, self.zero)
+            output = retrieve, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)

             return output

@@ -728,20 +706,18 @@ class NeuralMemory(Module):
             prev_layer_updates = TensorDict(prev_layer_updates)
             prev_layer_updates = prev_layer_updates.apply(lambda t: t[:, -1:])

-        values = None
-
         if store_seq_cache_len == self.chunk_size:

-            next_updates,
+            next_updates, store_state = self.store_memories(
                 cache_store_seq,
                 weights,
                 past_state = past_states,
                 prev_layer_updates = prev_layer_updates,
-                value_residual = value_residual
             )

             updates = next_updates
             cache_store_seq = None
+            next_states = store_state.states

         # retrieve

@@ -749,14 +725,9 @@ class NeuralMemory(Module):

         # next state tuple

-
-
-        output = (retrieved, next_state)
-
-        if return_values:
-            output = (*output, values)
+        next_store_state = NeuralMemCache(curr_seq_len, cache_store_seq, next_states, updates)

-        return
+        return retrieved, next_store_state

     def forward(
         self,
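`forward_inference` now takes only the token, the previous cache and optional `prev_layer_updates`, and returns `(retrieved, next_store_state)`. A hedged sketch of the call pattern, mirroring the `mac_transformer.py` hunk above; the memory configuration and tensor shapes are assumptions:

```python
import torch
from titans_pytorch import NeuralMemory, MemoryMLP

# assumed configuration -- dimensions kept equal so the sketch stands alone
mem = NeuralMemory(dim = 64, chunk_size = 16, model = MemoryMLP(dim = 64, depth = 2))

token = torch.randn(1, 1, 64)  # one decoding step; shape is an assumption

# first step: no cache yet, mirroring `state = next(neural_mem_caches, None)` above
retrieved, cache = mem.forward_inference(token, state = None)

# later steps feed the returned NeuralMemCache back in
retrieved, cache = mem.forward_inference(torch.randn(1, 1, 64), state = cache)
```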
@@ -767,50 +738,45 @@ class NeuralMemory(Module):
         return_aux_kv_loss = False,
         chunk_size = None,
         store_chunk_size = None,
-        return_values = False,
-        value_residual = None,
         return_next_state = False,
         prev_layer_updates: dict[str, Tensor] | None = None
     ):
         batch, seq_len = seq.shape[:2]

+        if not exists(mem_model_weights):
+            mem_model_weights = self.init_weights()
+
         if seq_len < self.retrieve_chunk_size:
-
+            retrieved = self.retrieve_memories(seq, mem_model_weights, chunk_size = 1)

             next_store_state = NeuralMemCache(seq_len, seq, None, None)

-            out = (
-
-            if return_values:
-                out = (*out, self.zero)
+            out = (retrieved, next_store_state)

             if not return_aux_kv_loss:
                 return out

             return out, self.zero

-        if not exists(mem_model_weights):
-            mem_model_weights = self.init_weights()
-
         # store

         store_seq = default(store_seq, seq)

-
-        store_chunk_size = default(store_chunk_size, chunk_size, self.store_chunk_size)
-        remainder = store_seq_len % store_chunk_size
-
-        (updates, next_state, values), aux_kv_recon_loss = self.store_memories(
+        (updates, next_store_state), aux_kv_recon_loss = self.store_memories(
            store_seq,
            mem_model_weights,
            chunk_size = store_chunk_size,
            prev_layer_updates = prev_layer_updates,
-            value_residual = value_residual,
            return_aux_kv_loss = True
        )

        # retrieve

+        if exists(prev_layer_updates):
+            prev_layer_updates = prev_layer_updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+
+        updates = updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+
        retrieved = self.retrieve_memories(
            seq,
            mem_model_weights + updates,

@@ -818,21 +784,8 @@ class NeuralMemory(Module):
            prev_layer_updates = prev_layer_updates
        )

-        # determine state for the storing of memories
-        # for transformer-xl like training with neural memory as well as inferencing with initial prompt
-
-        cache_store_seq = None
-
-        if remainder > 0:
-            cache_store_seq = store_seq[:, -remainder:]
-
-        next_store_state = NeuralMemCache(seq_len, cache_store_seq, next_state, updates)
-
        output = (retrieved, next_store_state)

-        if return_values:
-            output = (*output, values)
-
        if not return_aux_kv_loss:
            return output

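The simplified `NeuralMemory.forward` returns `(retrieved, next_store_state)`, with the auxiliary key/value reconstruction loss added only when `return_aux_kv_loss = True`. A hedged end-to-end sketch under the same assumed configuration as above:

```python
import torch
from titans_pytorch import NeuralMemory, MemoryMLP

# same assumed configuration as in the construction sketch above
mem = NeuralMemory(dim = 64, chunk_size = 16, model = MemoryMLP(dim = 64, depth = 2))

seq = torch.randn(1, 64, 64)  # (batch, seq_len, dim); values assumed

# default: retrieved memories plus the NeuralMemCache for the next segment
retrieved, next_store_state = mem(seq)

# with the auxiliary loss, matching how MemoryAsContextTransformer unpacks the call
(retrieved, next_store_state), aux_kv_recon_loss = mem(seq, return_aux_kv_loss = True)
```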
train_mac.py

@@ -34,7 +34,6 @@ NEURAL_MEM_LAYERS = (2, 4, 6) # layers 2, 4, 6 have neural memory,
 NEURAL_MEM_GATE_ATTN_OUTPUT = False
 NEURAL_MEM_MOMENTUM = True
 NEURAL_MEM_QK_NORM = False
-NEURAL_MEM_ADD_VALUE_RESIDUAL = False
 WINDOW_SIZE = 32
 NEURAL_MEM_SEGMENT_LEN = WINDOW_SIZE // 2 # set smaller for more granularity for learning rate / momentum etc
 SLIDING_WINDOWS = True

@@ -91,7 +90,6 @@ model = MemoryAsContextTransformer(
     use_flex_attn = USE_FLEX_ATTN,
     sliding_window_attn = SLIDING_WINDOWS,
     weight_tie_memory_model = WEIGHT_TIE_MEMORY_MODEL,
-    neural_memory_add_value_residual = NEURAL_MEM_ADD_VALUE_RESIDUAL,
     neural_memory_model = MemoryMLP(
         dim = 64,
         depth = NEURAL_MEMORY_DEPTH

@@ -164,6 +162,6 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training'):
         prime = decode_tokens(inp)
         print(f'%s \n\n %s', (prime, '*' * 100))

-        sample = model.sample(inp[None, ...], GENERATE_LENGTH, use_cache =
+        sample = model.sample(inp[None, ...], GENERATE_LENGTH, use_cache = USE_FAST_INFERENCE)
         output_str = decode_tokens(sample[0])
         print(output_str)
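For reference, a hedged sketch of how a training script like this exercises the changed paths; only `use_flex_attn`, `sliding_window_attn`, `weight_tie_memory_model`, `neural_memory_model` and the `sample(..., use_cache = ...)` call appear in the hunks above, while `num_tokens`, `dim`, `depth`, `segment_len` and all concrete values are assumptions:

```python
import torch
from titans_pytorch import MemoryAsContextTransformer, MemoryMLP

# assumed hyperparameters; only the kwargs shown in this diff are certain to exist
model = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 384,
    depth = 8,
    segment_len = 32,
    neural_memory_model = MemoryMLP(dim = 64, depth = 2),
    weight_tie_memory_model = True,
    use_flex_attn = False,
    sliding_window_attn = True,
)

prompt = torch.randint(0, 256, (1, 64))  # token ids, shaped like inp[None, ...] above

# use_cache = True exercises the rewritten NeuralMemory.forward_inference path
sample = model.sample(prompt, 128, use_cache = True)
```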