PyPI - titans-pytorch - Versions diffs - 0.2.9__tar.gz → 0.2.11__tar.gz - Mend

titans-pytorch 0.2.9.tar.gz → 0.2.11.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.2.9
+Version: 0.2.11
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.2.9"
+version = "0.2.11"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/tests/test_titans.py RENAMED Viewed

@@ -73,101 +73,62 @@ def test_titans_attn_memory():
     assert seq.shape == retrieved.shape
-def test_retrieve_store_diff_seq():
-    mem = NeuralMemory(
+def test_neural_mem_chaining_chunks():
+    mem  = NeuralMemory(
         dim = 384,
-        chunk_size = (64, 32),
+        dim_head = 64,
+        heads = 2,
+        chunk_size = 16
     )
-    retrieve_seq = torch.randn(2, 64 * 64, 384)
-    store_seq = torch.randn(2, 64 * 32, 384)
+    seq = torch.randn(2, 48, 384)
-    retrieved, _ = mem(retrieve_seq, store_seq = store_seq)
+    parallel_retrieved, state = mem(seq)
-    assert retrieve_seq.shape == retrieved.shape
+    seq_first, seq_second, seq_third = seq.split(16, dim = 1)
-def test_weight_tied_mlp_neural_mem():
-    from titans_pytorch import MemoryMLP
+    first_retrieved, state = mem(seq_first)
+    second_retrieved, state = mem(seq_second, state = state)
+    third_retrieved, state = mem(seq_third, state = state)
-    mlp = MemoryMLP(64, depth = 2)
+    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved, third_retrieved), dim = 1), atol = 1e-5)
+def test_neural_mem_chaining_with_batch_size():
     mem  = NeuralMemory(
         dim = 384,
         dim_head = 64,
         heads = 2,
-        chunk_size = 2,
-        model = mlp
-    )
-    mem2 = NeuralMemory(
-        dim = 384,
-        dim_head = 64,
-        heads = 2,
-        chunk_size = 2,
-        model = mlp
-    )
-    mem3 = NeuralMemory(
-        dim = 384,
-        dim_head = 64,
-        heads = 2,
-        chunk_size = 2,
-        model = mlp
-    )
-    seq = torch.randn(2, 128, 384)
-    seq, cache = mem(seq)
-    seq, cache2 = mem2(seq, prev_layer_updates = cache.updates)
-    seq, cache3 = mem3(seq, prev_layer_updates = cache2.updates)
-def test_mac_with_weight_tied_neural_mem():
-    from titans_pytorch import MemoryMLP, MemoryAsContextTransformer
-    transformer = MemoryAsContextTransformer(
-        num_tokens = 256,
-        dim = 256,
-        depth = 2,
-        segment_len = 2,
-        num_persist_mem_tokens = 0,
-        num_longterm_mem_tokens = 2,
-        neural_memory_segment_len = 2,
-        sliding_window_attn = True,
-        neural_memory_layers = (1, 2),
-        neural_memory_model = MemoryMLP(256, depth = 1),
-        num_residual_streams = 4,
-        weight_tie_memory_model = True,
-        neural_mem_gate_attn_output = True,
+        chunk_size = 16,
+        batch_size = 64
     )
+    seq = torch.randn(2, 112, 384)
-    ids = torch.randint(0, 256, (1, 1023))
-    logits = transformer(ids)
+    parallel_retrieved, state = mem(seq)
-    assert logits.shape == (1, 1023, 256)
+    seq_first, seq_second, seq_third = seq[:, :16], seq[:, 16:64], seq[:, 64:]
-def test_overriding_chunk_size():
-    mem = NeuralMemory(
-        dim = 384,
-        chunk_size = 64,
-    )
-    seq = torch.randn(2, 128 * 16, 384)
-    store_seq = torch.randn(2, 128 * 8, 384)
+    first_retrieved, state = mem(seq_first)
+    second_retrieved, state = mem(seq_second, state = state)
+    third_retrieved, state = mem(seq_third, state = state)
-    retrieved, _ = mem(seq, store_seq, chunk_size = 16, store_chunk_size = 8)
+    parallel_part_retrieved = torch.cat((first_retrieved, second_retrieved, third_retrieved), dim = 1)
-    assert seq.shape == retrieved.shape
+    assert torch.allclose(parallel_retrieved, parallel_part_retrieved, atol = 1e-5)
 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('num_persist_mem_tokens', (0, 16))
 @pytest.mark.parametrize('num_longterm_mem_tokens', (0, 16))
 @pytest.mark.parametrize('neural_mem_gate_attn_output', (False, True))
+@pytest.mark.parametrize('neural_mem_segment_len', (8, 16))
+@pytest.mark.parametrize('neural_mem_batch_size', (None, 64))
 def test_mac(
     seq_len,
     num_persist_mem_tokens,
     num_longterm_mem_tokens,
-    neural_mem_gate_attn_output
+    neural_mem_gate_attn_output,
+    neural_mem_segment_len,
+    neural_mem_batch_size
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
@@ -176,7 +137,9 @@ def test_mac(
         num_persist_mem_tokens = num_persist_mem_tokens,
         num_longterm_mem_tokens = num_longterm_mem_tokens,
         segment_len = 128,
-        neural_mem_gate_attn_output = neural_mem_gate_attn_output
+        neural_mem_gate_attn_output = neural_mem_gate_attn_output,
+        neural_memory_segment_len = neural_mem_segment_len,
+        neural_memory_batch_size = neural_mem_batch_size,
     )
     x = torch.randint(0, 256, (1, seq_len))

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/titans_pytorch/mac_transformer.py RENAMED Viewed

@@ -481,6 +481,7 @@ class MemoryAsContextTransformer(Module):
         neural_memory_add_value_residual = False,
         num_longterm_mem_tokens = 0,
         num_persist_mem_tokens = 0,
+        neural_memory_batch_size = None,
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
@@ -488,11 +489,8 @@ class MemoryAsContextTransformer(Module):
         neural_memory_model: Module | None = None,
         neural_memory_kwargs: dict = dict(),
         neural_memory_layers: tuple[int, ...] | None = None,
-        aux_kv_recon_loss_weight = 1.,
         use_flex_attn = False,
         sliding_window_attn = False,
-        weight_tie_memory_model = False,
-        prev_neural_mem_update_for_weights = None
     ):
         super().__init__()
@@ -526,16 +524,6 @@ class MemoryAsContextTransformer(Module):
         neural_memory_layers = default(neural_memory_layers, layers)
-        # weight tying neural memory model
-        maybe_copy = deepcopy if not weight_tie_memory_model else identity
-        if weight_tie_memory_model:
-            assert exists(neural_memory_model), '`neural_memory_model` must be explicitly set'
-        self.weight_tie_memory_model = weight_tie_memory_model
-        self.prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)
         # mem, attn, and feedforward layers
         for layer in layers:
@@ -564,7 +552,8 @@ class MemoryAsContextTransformer(Module):
                 mem = NeuralMemory(
                     dim = dim,
                     chunk_size = self.neural_memory_segment_len,
-                    model = maybe_copy(neural_memory_model),
+                    batch_size = neural_memory_batch_size,
+                    model = deepcopy(neural_memory_model),
                     **neural_memory_kwargs
                 )
@@ -585,10 +574,7 @@ class MemoryAsContextTransformer(Module):
         self.gate_attn_output = neural_mem_gate_attn_output
-        # auxiliary loss on kv recon
-        self.has_aux_kv_recon_loss = aux_kv_recon_loss_weight > 0.
-        self.aux_kv_recon_loss_weight = aux_kv_recon_loss_weight
+        # zero for maybe aux loss + device
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
@@ -696,7 +682,7 @@ class MemoryAsContextTransformer(Module):
         # math
-        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, prev_neural_mem_update_for_weights = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.prev_neural_mem_update_for_weights
+        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size
         seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)
@@ -749,18 +735,10 @@ class MemoryAsContextTransformer(Module):
         next_kv_caches = []
         next_neural_mem_caches = []
-        # weight tied neural memory
-        neural_memory_updates = None
         # value residual
         value_residual = None
-        # aux losses
-        kv_recon_losses = self.zero
         # when inferencing, only do one token at a time
         if is_inferencing:
@@ -784,24 +762,16 @@ class MemoryAsContextTransformer(Module):
                 mem_input, add_residual = mem_hyper_conn(x)
                 if not is_inferencing:
-                    (retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
-                        mem_input,
-                        return_aux_kv_loss = True,
-                        prev_layer_updates = neural_memory_updates
+                    retrieved, next_neural_mem_cache = mem(
+                        mem_input
                     )
-                    kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
                 else:
                     (retrieved, next_neural_mem_cache) = mem.forward_inference(
                         mem_input,
                         state = next(neural_mem_caches, None),
-                        prev_layer_updates = neural_memory_updates
                     )
-                if prev_neural_mem_update_for_weights:
-                    neural_memory_updates = next_neural_mem_cache.updates
                 if self.gate_attn_output:
                     attn_out_gates = retrieved.sigmoid()
                 else:
@@ -883,14 +853,4 @@ class MemoryAsContextTransformer(Module):
             return logits, next_cache
-        ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
-        losses = ar_loss
-        if self.has_aux_kv_recon_loss:
-            losses = losses + kv_recon_losses * self.aux_kv_recon_loss_weight
-        if not return_loss_breakdown:
-            return losses
-        return losses, (ar_loss, kv_recon_losses)
+        return F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/titans_pytorch/neural_memory.py RENAMED Viewed

@@ -6,7 +6,7 @@ from functools import partial
 from collections import namedtuple
 import torch
-from torch import nn, cat, Tensor
+from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
 from torch.nn import Linear, Module, Parameter, ParameterList
 from torch.func import functional_call, vmap, grad
@@ -38,7 +38,13 @@ w - num memory network weight parameters
 LinearNoBias = partial(Linear, bias = False)
-NeuralMemCache = namedtuple('NeuralMemCache', ['seq', 'cache_store_segment', 'states', 'updates'])
+NeuralMemCache = namedtuple('NeuralMemCache', [
+    'seq_index',
+    'weights',
+    'cache_store_segment',
+    'states',
+    'updates',
+])
 # functions
@@ -57,6 +63,9 @@ def identity(t):
 def xnor(x, y):
     return not (x ^ y)
+def divisible_by(num, den):
+    return (num % den) == 0
 def safe_cat(inputs, dim = -2):
     inputs = tuple(filter(exists, inputs))
@@ -67,9 +76,18 @@ def safe_cat(inputs, dim = -2):
     return cat(inputs, dim = dim)
+def is_empty_tensor(t):
+    return t.numel() == 0
 def dict_get_shape(td):
     return {k: v.shape for k, v in td.items()}
+def rearrange_dict_values(td, pattern, **kwargs):
+    return td.apply(lambda t: rearrange(t, pattern, **kwargs))
+def repeat_dict_values(td, pattern, **kwargs):
+    return td.apply(lambda t: repeat(t, pattern, **kwargs))
 def pair(v):
     return (v, v) if not isinstance(v, tuple) else v
@@ -106,6 +124,9 @@ def softclamp_max(t, max_value):
     return ((t / half_max_value).tanh() * half_max_value) + half_max_value
 def softclamp_grad_norm(t, max_value):
+    if is_empty_tensor(t):
+        return t
     t, inverse = pack_one_with_inverse(t, 'bn *')
     norm = t.norm(dim = -1, keepdim = True)
@@ -195,6 +216,12 @@ class AssocScan(Module):
     ):
         remove_prev = default(remove_prev, exists(prev))
+        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
+        gates, _ = pack_one_with_inverse(gates, 'b n *')
+        if exists(prev):
+            prev, _ = pack_one_with_inverse(prev, 'b *')
         if exists(prev):
             inputs, _ = pack([prev, inputs], 'b * d')
             gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
@@ -205,7 +232,7 @@ class AssocScan(Module):
             if remove_prev:
                 out = out[:, 1:]
-            return out
+            return inverse_pack_weight_shape(out)
         from accelerated_scan.triton import scan as triton_scan
         from accelerated_scan.warp import scan as warp_scan
@@ -226,6 +253,7 @@ class AssocScan(Module):
             outputs = outputs[..., :seq_len]
             outputs = rearrange(outputs, 'b d n -> b n d')
             return outputs
         out = accelerate_scan_fn(gates, inputs)
@@ -233,7 +261,7 @@ class AssocScan(Module):
         if remove_prev:
             out = out[:, 1:]
-        return out
+        return inverse_pack_weight_shape(out)
 # main neural memory
@@ -248,12 +276,13 @@ class NeuralMemory(Module):
         self,
         dim,
         chunk_size: int | tuple[int, int] = 1,
+        batch_size = None,
         dim_head = None,
         heads = 1,
         model: Module | None = None,
         store_memory_loss_fn: Callable = default_loss_fn,
         adaptive_step_transform: Callable | None = None,
-        default_step_transform_max_lr = 1e-2,
+        default_step_transform_max_lr = 1.,
         per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
         max_mem_layer_modulation = 1e1, # max of 10.
         attn_pool_chunks = False,
@@ -274,6 +303,13 @@ class NeuralMemory(Module):
         self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)
+        # batch size
+        if exists(batch_size):
+            assert divisible_by(batch_size, self.store_chunk_size)
+        self.batch_size = batch_size
         # associative scan
         self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)
@@ -342,14 +378,13 @@ class NeuralMemory(Module):
             pred = functional_call(self.memory_model, params, inputs)
             loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
             weighted_loss = loss * loss_weights
-            return weighted_loss.sum(), weighted_loss.mean()
+            return weighted_loss.sum()
         # two functions
-        grad_fn = grad(forward_and_loss, has_aux = True)
+        grad_fn = grad(forward_and_loss)
-        self.per_sample_grad_fn = vmap(grad_fn, in_dims = (None, 0, 0, 0))
-        self.per_sample_grad_fn_expanded_weights = vmap(grad_fn, in_dims = (0,) * 4)
+        self.per_sample_grad_fn = vmap(grad_fn, in_dims = (0, 0, 0, 0))
         # queries for retrieving from the model
@@ -417,56 +452,58 @@ class NeuralMemory(Module):
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
-    def init_weights(self):
+    def init_weights(
+        self,
+        batch,
+    ):
         weights = TensorDict(dict(self.memory_model.named_parameters()))
+        weights = repeat_dict_values(weights, '... -> bh ...', bh = batch * self.heads)
         return weights
+    def init_momentum(
+        self,
+        batch,
+    ):
+        weights = TensorDict(dict(self.memory_model.named_parameters()))
+        zeros = weights.clone().zero_()
+        zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
+        return zeros
     def store_memories(
         self,
         seq,
-        weights: dict[str, Tensor],
+        weights: dict[str, Tensor] | None = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        prev_layer_updates: dict[str, Tensor] | None = None,
-        return_aux_kv_loss = False,
-        chunk_size = None,
+        seq_index = 0
     ):
-        seq_len, heads, chunk_size = seq.shape[-2], self.heads, default(chunk_size, self.store_chunk_size)
-        # handle edge case
-        if seq_len < chunk_size:
-            return TensorDict(weights).clone().zero_(), self.zero
-        seq = self.store_norm(seq)
+        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, self.store_chunk_size
         # curtail sequence by multiple of the chunk size
         # only a complete chunk of the sequence provides the memory for the next chunk
         round_down_seq_len = round_down_multiple(seq_len, chunk_size)
+        num_chunks = round_down_seq_len // chunk_size
         seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]
-        # per sample grad function
-        per_sample_grad_fn = self.per_sample_grad_fn
+        next_seq_len_index = seq_index + round_down_seq_len
+        # init weights if needed
         # weights of the memory network
+        if not exists(weights):
+            weights = self.init_weights(batch)
         weights = TensorDict(weights)
         # allow for neural memory of a previous layer to influence surprise of current layer
-        weights_for_surprise = weights
-        if exists(prev_layer_updates):
-            prev_layer_updates = TensorDict(prev_layer_updates)
-            weights_for_surprise = weights_for_surprise + prev_layer_updates
-            per_sample_grad_fn = self.per_sample_grad_fn_expanded_weights # the weights will now have a batch * chunk dimension
+        weights_for_surprise = repeat_dict_values(weights, 'b ... -> b n ...', n = num_chunks)
         # derive learned hparams for optimization of memory network
+        seq = self.store_norm(seq)
         adaptive_lr = self.to_adaptive_step(seq)
         adaptive_lr = self.adaptive_step_transform(adaptive_lr)
@@ -474,7 +511,7 @@ class NeuralMemory(Module):
         decay_factor = self.to_decay_factor(chunked_seq).sigmoid()
-        need_layer_lr_mod = exists(self.to_layer_modulation)
+        need_layer_lr_mod = exists(self.to_layer_modulation) and num_chunks > 0
         has_momentum = exists(self.to_momentum)
         if has_momentum:
@@ -505,12 +542,11 @@ class NeuralMemory(Module):
         # flatten batch and time if surprise depends on previous layer memory model
-        if exists(prev_layer_updates):
-            weights_for_surprise = weights_for_surprise.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+        weights_for_surprise = rearrange_dict_values(weights_for_surprise, 'b n ... -> (b n) ...')
         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)
-        grads, aux_kv_recon_loss = per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)
+        grads = self.per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)
         grads = TensorDict(grads)
@@ -521,7 +557,7 @@ class NeuralMemory(Module):
         # restore batch and sequence dimension
-        grads = grads.apply(lambda t: rearrange(t, '(b n) ... -> b n ...', b = batch * heads))
+        grads = rearrange_dict_values(grads, '(b n) ... -> b n ...', b = batch * heads)
         # maybe per layer modulation
@@ -535,19 +571,25 @@ class NeuralMemory(Module):
         # past states
         if not exists(past_state):
-            empty_dict = {key: None for key in weights.keys()}
             # minibatch_init_weight corresponds to W0 in figure 7 of TTT paper
             minibatch_init_weight = weights
+            init_momentum = self.init_momentum(batch)
-            if dict_get_shape(weights) == self.init_weight_shape:
-                minibatch_init_weight = weights.apply(lambda t: repeat(t, '... -> b 1 (...)', b = batch * heads))
-            past_state = (minibatch_init_weight, empty_dict)
+            past_state = (minibatch_init_weight, init_momentum)
         past_last_update, past_last_momentum = past_state
+        # early return if sequence length less than chunk size
+        if num_chunks == 0:
+            updates = rearrange_dict_values(weights, 'bh ... -> bh 1 ...')
+            next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, past_state, updates)
+            output = (updates, next_store_state)
+            return output
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
         next_momentum = TensorDict() if has_momentum else None
@@ -558,8 +600,6 @@ class NeuralMemory(Module):
         for (param_name, surprise), (_, last_update), (_, last_momentum) in zip(surprises.items(), past_last_update.items(), past_last_momentum.items()):
-            surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')
             update = surprise
             # derive momentum with associative scan - eq (10)
@@ -571,62 +611,51 @@ class NeuralMemory(Module):
             # use associative scan again for learned forgetting (weight decay) - eq (13)
-            update = self.assoc_scan(1. - decay_factor, update, prev = last_update)
+            update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)
             next_last_update[param_name] = update[:, -1]
-            updates[param_name] = inverse_pack(update)
+            updates[param_name] = update
             if has_momentum:
-                next_momentum[param_name] = inverse_pack(momentum)
+                next_momentum[param_name] = momentum
         # determine next state for the storing of memories
         next_state = (next_last_update, next_last_momentum)
-        next_store_state = NeuralMemCache(seq_len, remainder, next_state, updates)
+        next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, next_state, updates)
         # returns
         output = (updates, next_store_state)
-        if not return_aux_kv_loss:
-            return output
-        return output, aux_kv_recon_loss.mean()
+        return output
     def retrieve_memories(
         self,
         seq,
         past_weights: dict[str, Tensor],
-        chunk_size = None,
-        prev_layer_updates: dict[str, Tensor] | None = None
     ):
-        chunk_size = default(chunk_size, self.retrieve_chunk_size)
+        chunk_size = self.retrieve_chunk_size
         batch, seq_len = seq.shape[:2]
         seq = self.retrieve_norm(seq)
-        assert seq_len >= chunk_size, 'must be handled outside of retrieve'
         needs_pad = chunk_size > 1
-        if needs_pad:
-            seq = pad_at_dim(seq, (1, 0), dim = 1)
-            seq_len_plus_one = seq.shape[-2]
+        seq = pad_at_dim(seq, (1, 0), dim = 1)
+        seq_len_plus_one = seq.shape[-2]
-            next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)
+        next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)
-            padding = next_seq_len - seq_len_plus_one
-            seq = pad_at_dim(seq, (0, padding), dim = 1)
+        padding = next_seq_len - seq_len_plus_one
+        seq = pad_at_dim(seq, (0, padding), dim = 1)
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
         curr_weights = TensorDict(past_weights)
-        if exists(prev_layer_updates):
-            curr_weights = curr_weights + TensorDict(prev_layer_updates)
         # sequence Float['b n d'] to queries
         queries = self.to_queries(seq)
@@ -642,7 +671,7 @@ class NeuralMemory(Module):
         # fetch values from memory model
         if dict_get_shape(curr_weights) != self.init_weight_shape:
-            curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+            curr_weights = rearrange_dict_values(curr_weights, 'b n ... -> (b n) ...')
         queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)
@@ -669,8 +698,7 @@ class NeuralMemory(Module):
         # restore, pad with empty memory embed
-        if needs_pad:
-            values = values[:, 1:(seq_len + 1)]
+        values = values[:, 1:(seq_len + 1)]
         return values
@@ -678,16 +706,14 @@ class NeuralMemory(Module):
     def forward_inference(
         self,
         token: Tensor,
-        state = None,
-        prev_layer_updates: dict[str, Tensor] | None = None,
+        state: NeuralMemCache | None = None,
     ):
         # unpack previous state
         if not exists(state):
-            state = (0, None, None, None)
+            state = (0, None, None, None, None)
-        seq_index, cache_store_seq, past_states, updates = state
+        seq_index, weights, cache_store_seq, past_states, updates = state
         curr_seq_len = seq_index + 1
         batch = token.shape[0]
@@ -695,9 +721,7 @@ class NeuralMemory(Module):
         if token.ndim == 2:
             token = rearrange(token, 'b d -> b 1 d')
-        # get memory model weights
-        weights = self.init_weights()
+        assert token.shape[1] == 1
         # increment the sequence cache which is at most the chunk size
@@ -708,7 +732,7 @@ class NeuralMemory(Module):
         if curr_seq_len < self.chunk_size:
             retrieve = self.retrieve_memories(token, weights, chunk_size = 1)
-            output = retrieve, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)
+            output = retrieve, NeuralMemCache(curr_seq_len, weights, cache_store_seq, past_states, updates)
             return output
@@ -719,21 +743,16 @@ class NeuralMemory(Module):
         if not exists(updates):
             updates = weights.clone().zero_()
-            updates = updates.apply(lambda t: repeat(t, '... -> b 1 ...', b = batch))
+            updates = repeat_dict_values(updates, '... -> b 1 ...', b = batch)
         else:
             updates = updates.apply(lambda t: t[:, -1:])
-        if exists(prev_layer_updates):
-            prev_layer_updates = TensorDict(prev_layer_updates)
-            prev_layer_updates = prev_layer_updates.apply(lambda t: t[:, -1:])
         if store_seq_cache_len == self.chunk_size:
             next_updates, store_state = self.store_memories(
                 cache_store_seq,
                 weights,
                 past_state = past_states,
-                prev_layer_updates = prev_layer_updates,
             )
             updates = next_updates
@@ -746,7 +765,7 @@ class NeuralMemory(Module):
         # next state tuple
-        next_store_state = NeuralMemCache(curr_seq_len, cache_store_seq, next_states, updates)
+        next_store_state = NeuralMemCache(curr_seq_len, weights, cache_store_seq, next_states, updates)
         return retrieved, next_store_state
@@ -754,63 +773,99 @@ class NeuralMemory(Module):
         self,
         seq,
         store_seq = None,
-        mem_model_weights: dict[str, Tensor] | None = None,
-        past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        return_aux_kv_loss = False,
-        chunk_size = None,
-        store_chunk_size = None,
-        return_next_state = False,
-        prev_layer_updates: dict[str, Tensor] | None = None
+        state: NeuralMemCache | None = None,
     ):
-        batch, seq_len = seq.shape[:2]
+        if not exists(state):
+            state = (0, None, None, None, None)
-        if not exists(mem_model_weights):
-            mem_model_weights = self.init_weights()
+        seq_index, weights, cache_store_seq, past_state, updates = state
-        if seq_len < self.retrieve_chunk_size:
-            retrieved = self.retrieve_memories(seq, mem_model_weights, chunk_size = 1)
+        assert not exists(cache_store_seq) or is_empty_tensor(cache_store_seq)
-            next_store_state = NeuralMemCache(seq_len, seq, None, None)
+        # store
-            out = (retrieved, next_store_state)
+        store_seq = default(store_seq, seq)
-            if not return_aux_kv_loss:
-                return out
+        # functions
-            return out, self.zero
+        # compute split sizes of sequence
+        # for now manually update weights to last update at the correct boundaries
-        # store
+        store_seq_len, chunk_size, batch_size = store_seq.shape[-2], self.chunk_size, self.batch_size
-        store_seq = default(store_seq, seq)
+        need_update_weights = exists(batch_size)
-        (updates, next_store_state), aux_kv_recon_loss = self.store_memories(
-            store_seq,
-            mem_model_weights,
-            chunk_size = store_chunk_size,
-            prev_layer_updates = prev_layer_updates,
-            return_aux_kv_loss = True
-        )
+        # determine split sizes and when to update
-        # retrieve
+        if need_update_weights:
+            update_after_final_store = divisible_by(seq_index + store_seq_len, batch_size)
+            seq_range = torch.arange(store_seq_len) + seq_index + 1
+            batch_boundary = divisible_by(seq_range, batch_size)
+            indices = seq_range[batch_boundary] - seq_index
+            indices = F.pad(indices, (1, 0), value = 0)
+            if indices[-1] != store_seq_len:
+                indices = F.pad(indices, (0, 1), value = store_seq_len)
+            split_sizes = (indices[1:] - indices[:-1]).tolist()
+            assert sum(split_sizes) == store_seq_len
+        else:
+            split_sizes = (store_seq_len,)
+            update_after_final_store = False
-        retrieve_chunk_size = default(chunk_size, self.retrieve_chunk_size)
+        # accumulate updates
-        if retrieve_chunk_size != 1:
-            if exists(prev_layer_updates):
-                prev_layer_updates = prev_layer_updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+        updates = None
-            updates = updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+        def accum_updates(past_updates, future_updates):
+            if not exists(past_updates):
+                return future_updates
+            return TensorDict({param_name: cat((past_update[:, :-1], future_update), dim = 1) for (param_name, past_update), (_, future_update) in zip(past_updates.items(), future_updates.items())})
+        # loop through chunks of store sequences
+        store_seqs = store_seq.split(split_sizes, dim = -2)
+        for ind, store_seq_chunk in enumerate(store_seqs):
+            is_last = ind == (len(store_seqs) - 1)
+            # store
+            next_updates, next_neural_mem_state = self.store_memories(
+                store_seq_chunk,
+                weights,
+                seq_index = seq_index,
+                past_state = past_state,
+            )
+            seq_index = next_neural_mem_state.seq_index
+            past_state = next_neural_mem_state.states
+            updates = accum_updates(updates, next_updates)
+            if is_last and not update_after_final_store:
+                continue
+            # update weights once batch size is fulfilled
+            last_update, _ = past_state
+            weights = last_update
+            next_neural_mem_state = list(next_neural_mem_state)
+            next_neural_mem_state[1] = last_update
+            next_neural_mem_state = NeuralMemCache(*next_neural_mem_state)
+        # retrieve
         retrieved = self.retrieve_memories(
             seq,
-            updates,
-            chunk_size = chunk_size,
-            prev_layer_updates = prev_layer_updates
+            updates
         )
-        output = (retrieved, next_store_state)
-        if not return_aux_kv_loss:
-            return output
-        return output, aux_kv_recon_loss
+        return retrieved, next_neural_mem_state

{titans_pytorch-0.2.9 → titans_pytorch-0.2.11}/train_mac.py RENAMED Viewed

@@ -35,13 +35,11 @@ NEURAL_MEM_GATE_ATTN_OUTPUT = False
 NEURAL_MEM_MOMENTUM = True
 NEURAL_MEM_QK_NORM = True
 WINDOW_SIZE = 32
-NEURAL_MEM_SEGMENT_LEN = WINDOW_SIZE // 2       # set smaller for more granularity for learning rate / momentum etc
+NEURAL_MEM_SEGMENT_LEN = 2                      # set smaller for more granularity for learning rate / momentum etc
+NEURAL_MEM_BATCH_SIZE = 128                     # set smaller to update the neural memory weights more often as it traverses the sequence
 SLIDING_WINDOWS = True
-WEIGHT_TIE_MEMORY_MODEL = False                 # set to have memory MLP shared across layers
-PREV_MEM_UPDATE_FOR_WEIGHTS = True,
 STORE_ATTN_POOL_CHUNKS = True                   # whether to use attention pooling for chunk derived momentum, per-layer lr mod, decay
 MEMORY_MODEL_PER_LAYER_LEARNED_LR = True
-KV_RECON_LOSS_WEIGHT = 1.
 # experiment related
@@ -86,12 +84,10 @@ model = MemoryAsContextTransformer(
     num_longterm_mem_tokens = NUM_LONGTERM_MEM,
     neural_memory_layers = NEURAL_MEM_LAYERS,
     neural_memory_segment_len = NEURAL_MEM_SEGMENT_LEN,
+    neural_memory_batch_size = NEURAL_MEM_BATCH_SIZE,
     neural_mem_gate_attn_output = NEURAL_MEM_GATE_ATTN_OUTPUT,
-    aux_kv_recon_loss_weight = KV_RECON_LOSS_WEIGHT,
     use_flex_attn = USE_FLEX_ATTN,
     sliding_window_attn = SLIDING_WINDOWS,
-    weight_tie_memory_model = WEIGHT_TIE_MEMORY_MODEL,
-    prev_neural_mem_update_for_weights = PREV_MEM_UPDATE_FOR_WEIGHTS,
     neural_memory_model = MemoryMLP(
         dim = 64,
         depth = NEURAL_MEMORY_DEPTH
@@ -143,20 +139,20 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training'):
     model.train()
     for __ in range(GRADIENT_ACCUMULATE_EVERY):
-        loss, (ar_loss, kv_recon_losses) = model(next(train_loader), return_loss = True, return_loss_breakdown = True)
+        loss = model(next(train_loader), return_loss = True)
         loss.backward()
-    print(f'training loss: {ar_loss.item()}')
+    print(f'training loss: {loss.item()}')
     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
     optim.step()
     optim.zero_grad()
-    wandb.log(dict(loss = ar_loss.item()))
+    wandb.log(dict(loss = loss.item()))
     if i % VALIDATE_EVERY == 0:
         model.eval()
         with torch.no_grad():
-            loss, (ar_loss, _) = model(next(val_loader), return_loss = True, return_loss_breakdown = True)
-            print(f'validation loss: {ar_loss.item()}')
+            loss = model(next(val_loader), return_loss = True)
+            print(f'validation loss: {loss.item()}')
     if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
         model.eval()