titans-pytorch 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
titans_pytorch/mac_transformer.py

@@ -481,6 +481,7 @@ class MemoryAsContextTransformer(Module):
  neural_memory_add_value_residual = False,
  num_longterm_mem_tokens = 0,
  num_persist_mem_tokens = 0,
+ neural_memory_batch_size = None,
  dim_head = 64,
  heads = 8,
  ff_mult = 4,
@@ -488,11 +489,8 @@ class MemoryAsContextTransformer(Module):
  neural_memory_model: Module | None = None,
  neural_memory_kwargs: dict = dict(),
  neural_memory_layers: tuple[int, ...] | None = None,
- aux_kv_recon_loss_weight = 1.,
  use_flex_attn = False,
  sliding_window_attn = False,
- weight_tie_memory_model = False,
- prev_neural_mem_update_for_weights = None
  ):
  super().__init__()
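Taken together, the two hunks above change the public constructor: `neural_memory_batch_size` is added, while `aux_kv_recon_loss_weight`, `weight_tie_memory_model` and `prev_neural_mem_update_for_weights` are removed. A minimal, hedged construction sketch against the 0.2.11 signature; the keyword arguments other than `neural_memory_batch_size` follow the upstream README and the sizes are purely illustrative:

    import torch
    from titans_pytorch import MemoryAsContextTransformer

    transformer = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 384,
        depth = 2,
        segment_len = 32,
        num_persist_mem_tokens = 4,
        num_longterm_mem_tokens = 4,
        neural_memory_batch_size = 64,  # new in 0.2.11: tokens stored before the memory weights are committed
    )

    ids = torch.randint(0, 256, (1, 512))  # token ids reused in the training sketch further down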
@@ -526,16 +524,6 @@ class MemoryAsContextTransformer(Module):

  neural_memory_layers = default(neural_memory_layers, layers)

- # weight tying neural memory model
-
- maybe_copy = deepcopy if not weight_tie_memory_model else identity
-
- if weight_tie_memory_model:
- assert exists(neural_memory_model), '`neural_memory_model` must be explicitly set'
-
- self.weight_tie_memory_model = weight_tie_memory_model
- self.prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)
-
  # mem, attn, and feedforward layers

  for layer in layers:
@@ -564,7 +552,8 @@ class MemoryAsContextTransformer(Module):
  mem = NeuralMemory(
  dim = dim,
  chunk_size = self.neural_memory_segment_len,
- model = maybe_copy(neural_memory_model),
+ batch_size = neural_memory_batch_size,
+ model = deepcopy(neural_memory_model),
  **neural_memory_kwargs
  )

@@ -585,10 +574,7 @@ class MemoryAsContextTransformer(Module):

  self.gate_attn_output = neural_mem_gate_attn_output

- # auxiliary loss on kv recon
-
- self.has_aux_kv_recon_loss = aux_kv_recon_loss_weight > 0.
- self.aux_kv_recon_loss_weight = aux_kv_recon_loss_weight
+ # zero for maybe aux loss + device

  self.register_buffer('zero', torch.tensor(0.), persistent = False)

@@ -696,7 +682,7 @@ class MemoryAsContextTransformer(Module):

  # math

- batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, prev_neural_mem_update_for_weights = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.prev_neural_mem_update_for_weights
+ batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size

  seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)

@@ -749,18 +735,10 @@ class MemoryAsContextTransformer(Module):
  next_kv_caches = []
  next_neural_mem_caches = []

- # weight tied neural memory
-
- neural_memory_updates = None
-
  # value residual

  value_residual = None

- # aux losses
-
- kv_recon_losses = self.zero
-
  # when inferencing, only do one token at a time

  if is_inferencing:
@@ -784,24 +762,16 @@ class MemoryAsContextTransformer(Module):
  mem_input, add_residual = mem_hyper_conn(x)

  if not is_inferencing:
- (retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
- mem_input,
- return_aux_kv_loss = True,
- prev_layer_updates = neural_memory_updates
+ retrieved, next_neural_mem_cache = mem(
+ mem_input
  )

- kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
-
  else:
  (retrieved, next_neural_mem_cache) = mem.forward_inference(
  mem_input,
  state = next(neural_mem_caches, None),
- prev_layer_updates = neural_memory_updates
  )

- if prev_neural_mem_update_for_weights:
- neural_memory_updates = next_neural_mem_cache.updates
-
  if self.gate_attn_output:
  attn_out_gates = retrieved.sigmoid()
  else:
@@ -883,14 +853,4 @@ class MemoryAsContextTransformer(Module):

  return logits, next_cache

- ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
-
- losses = ar_loss
-
- if self.has_aux_kv_recon_loss:
- losses = losses + kv_recon_losses * self.aux_kv_recon_loss_weight
-
- if not return_loss_breakdown:
- return losses
-
- return losses, (ar_loss, kv_recon_losses)
+ return F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
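With the auxiliary loss machinery gone, the training path returns the plain autoregressive cross entropy as a single scalar rather than a loss/breakdown tuple. A hedged training-step sketch, reusing `transformer` and `ids` from the constructor sketch above and assuming the existing `return_loss = True` flag that leads into this branch:

    loss = transformer(ids, return_loss = True)  # single cross-entropy scalar in 0.2.11
    loss.backward()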
titans_pytorch/neural_memory.py

@@ -6,7 +6,7 @@ from functools import partial
  from collections import namedtuple

  import torch
- from torch import nn, cat, Tensor
+ from torch import nn, cat, tensor, Tensor
  import torch.nn.functional as F
  from torch.nn import Linear, Module, Parameter, ParameterList
  from torch.func import functional_call, vmap, grad
@@ -38,7 +38,13 @@ w - num memory network weight parameters

  LinearNoBias = partial(Linear, bias = False)

- NeuralMemCache = namedtuple('NeuralMemCache', ['seq', 'cache_store_segment', 'states', 'updates'])
+ NeuralMemCache = namedtuple('NeuralMemCache', [
+ 'seq_index',
+ 'weights',
+ 'cache_store_segment',
+ 'states',
+ 'updates',
+ ])

  # functions
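The cache namedtuple grows from four to five fields: `seq` becomes `seq_index` and the committed memory-model `weights` now travel with the state, so any caller that unpacked the old 4-tuple positionally must be updated. A small sketch of the new layout; per the hunks further down, the empty state that `forward` and `forward_inference` fall back to is `(0, None, None, None, None)`:

    from titans_pytorch.neural_memory import NeuralMemCache

    state = NeuralMemCache(
        seq_index = 0,              # number of tokens stored so far
        weights = None,             # committed memory-model weights (lazily initialized TensorDict)
        cache_store_segment = None, # tokens buffered until a full store chunk is available
        states = None,              # (last_update, last_momentum) carried between chunks
        updates = None,             # per-chunk weight updates used for retrieval
    )

    seq_index, weights, cache_store_seq, past_states, updates = state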
@@ -57,6 +63,9 @@ def identity(t):
  def xnor(x, y):
  return not (x ^ y)

+ def divisible_by(num, den):
+ return (num % den) == 0
+
  def safe_cat(inputs, dim = -2):
  inputs = tuple(filter(exists, inputs))

@@ -67,9 +76,18 @@ def safe_cat(inputs, dim = -2):

  return cat(inputs, dim = dim)

+ def is_empty_tensor(t):
+ return t.numel() == 0
+
  def dict_get_shape(td):
  return {k: v.shape for k, v in td.items()}

+ def rearrange_dict_values(td, pattern, **kwargs):
+ return td.apply(lambda t: rearrange(t, pattern, **kwargs))
+
+ def repeat_dict_values(td, pattern, **kwargs):
+ return td.apply(lambda t: repeat(t, pattern, **kwargs))
+
  def pair(v):
  return (v, v) if not isinstance(v, tuple) else v
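The two new `*_dict_values` helpers just map an einops pattern over every tensor held in a TensorDict. A standalone sketch of what `repeat_dict_values` does to a parameter dict (shapes are illustrative):

    import torch
    from einops import repeat
    from tensordict import TensorDict

    def repeat_dict_values(td, pattern, **kwargs):
        return td.apply(lambda t: repeat(t, pattern, **kwargs))

    params = TensorDict(dict(w1 = torch.randn(16, 16), w2 = torch.randn(16, 16)))
    batched = repeat_dict_values(params, '... -> bh ...', bh = 8)

    assert batched['w1'].shape == (8, 16, 16)  # every weight gains a leading batch * heads dimension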
@@ -106,6 +124,9 @@ def softclamp_max(t, max_value):
  return ((t / half_max_value).tanh() * half_max_value) + half_max_value

  def softclamp_grad_norm(t, max_value):
+ if is_empty_tensor(t):
+ return t
+
  t, inverse = pack_one_with_inverse(t, 'bn *')

  norm = t.norm(dim = -1, keepdim = True)
@@ -195,6 +216,12 @@ class AssocScan(Module):
  ):
  remove_prev = default(remove_prev, exists(prev))

+ inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
+ gates, _ = pack_one_with_inverse(gates, 'b n *')
+
+ if exists(prev):
+ prev, _ = pack_one_with_inverse(prev, 'b *')
+
  if exists(prev):
  inputs, _ = pack([prev, inputs], 'b * d')
  gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
@@ -205,7 +232,7 @@ class AssocScan(Module):
  if remove_prev:
  out = out[:, 1:]

- return out
+ return inverse_pack_weight_shape(out)

  from accelerated_scan.triton import scan as triton_scan
  from accelerated_scan.warp import scan as warp_scan
@@ -226,6 +253,7 @@ class AssocScan(Module):

  outputs = outputs[..., :seq_len]
  outputs = rearrange(outputs, 'b d n -> b n d')
+
  return outputs

  out = accelerate_scan_fn(gates, inputs)
@@ -233,7 +261,7 @@ class AssocScan(Module):
  if remove_prev:
  out = out[:, 1:]

- return out
+ return inverse_pack_weight_shape(out)
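Both the reference branch and the accelerated branch now pack any trailing dimensions into a single feature axis before scanning and restore them on return, so the scan can run directly over matrix-shaped per-parameter updates instead of only `[batch, seq, dim]` tensors. A hedged sketch of the intended call shape, on the non-accelerated path and with gates shaped like the inputs:

    import torch
    from titans_pytorch.neural_memory import AssocScan

    scan = AssocScan(use_accelerated = False)

    gates  = torch.rand(2, 6, 16, 16)   # forgetting gates in [0, 1]
    inputs = torch.randn(2, 6, 16, 16)  # (batch, steps, *arbitrary trailing shape)

    out = scan(gates, inputs)           # trailing shape is packed for the scan, then restored
    assert out.shape == inputs.shape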
  # main neural memory

@@ -248,12 +276,13 @@ class NeuralMemory(Module):
  self,
  dim,
  chunk_size: int | tuple[int, int] = 1,
+ batch_size = None,
  dim_head = None,
  heads = 1,
  model: Module | None = None,
  store_memory_loss_fn: Callable = default_loss_fn,
  adaptive_step_transform: Callable | None = None,
- default_step_transform_max_lr = 1e-2,
+ default_step_transform_max_lr = 1.,
  per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
  max_mem_layer_modulation = 1e1, # max of 10.
  attn_pool_chunks = False,
@@ -274,6 +303,13 @@ class NeuralMemory(Module):

  self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)

+ # batch size
+
+ if exists(batch_size):
+ assert divisible_by(batch_size, self.store_chunk_size)
+
+ self.batch_size = batch_size
+
  # associative scan

  self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)
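The standalone `NeuralMemory` module gains the same knob: `batch_size` sets how many stored tokens accumulate before the memory weights are committed, and the assertion above requires it to be a multiple of the store chunk size. A hedged construction sketch with illustrative sizes:

    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(
        dim = 384,
        chunk_size = 4,   # store / retrieve chunk size
        batch_size = 64,  # new in 0.2.11: commit weights every 64 stored tokens (a multiple of chunk_size)
    )

    seq = torch.randn(1, 512, 384)
    retrieved, state = mem(seq)  # retrieved memories plus a NeuralMemCache for the next call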
@@ -342,14 +378,13 @@ class NeuralMemory(Module):
  pred = functional_call(self.memory_model, params, inputs)
  loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
  weighted_loss = loss * loss_weights
- return weighted_loss.sum(), weighted_loss.mean()
+ return weighted_loss.sum()

  # two functions

- grad_fn = grad(forward_and_loss, has_aux = True)
+ grad_fn = grad(forward_and_loss)

- self.per_sample_grad_fn = vmap(grad_fn, in_dims = (None, 0, 0, 0))
- self.per_sample_grad_fn_expanded_weights = vmap(grad_fn, in_dims = (0,) * 4)
+ self.per_sample_grad_fn = vmap(grad_fn, in_dims = (0, 0, 0, 0))
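The auxiliary output is dropped from the store loss, and the memory-model weights now always carry a leading batch dimension, so a single `vmap(grad(...), in_dims = (0, 0, 0, 0))` covers every case. A self-contained torch.func sketch of that per-sample-gradient pattern (toy model and shapes, not the library's internals):

    import torch
    from torch import nn
    from torch.func import functional_call, vmap, grad

    model = nn.Linear(8, 8, bias = False)

    def forward_and_loss(params, inputs, loss_weights, target):
        pred = functional_call(model, params, inputs)
        loss = (pred - target).pow(2).mean(dim = -1)  # stand-in for the store loss
        return (loss * loss_weights).sum()

    per_sample_grad_fn = vmap(grad(forward_and_loss), in_dims = (0, 0, 0, 0))

    b, n = 4, 16
    params = {'weight': torch.randn(b, 8, 8)}  # one weight copy per sample, matching the batched weights above
    inputs, target = torch.randn(b, n, 8), torch.randn(b, n, 8)
    loss_weights = torch.ones(b, n)

    grads = per_sample_grad_fn(params, inputs, loss_weights, target)
    assert grads['weight'].shape == (b, 8, 8)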
  # queries for retrieving from the model

@@ -417,56 +452,58 @@ class NeuralMemory(Module):

  self.register_buffer('zero', torch.tensor(0.), persistent = False)

- def init_weights(self):
+ def init_weights(
+ self,
+ batch,
+ ):
  weights = TensorDict(dict(self.memory_model.named_parameters()))
+ weights = repeat_dict_values(weights, '... -> bh ...', bh = batch * self.heads)
  return weights

+ def init_momentum(
+ self,
+ batch,
+ ):
+ weights = TensorDict(dict(self.memory_model.named_parameters()))
+ zeros = weights.clone().zero_()
+ zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
+ return zeros
+
  def store_memories(
  self,
  seq,
- weights: dict[str, Tensor],
+ weights: dict[str, Tensor] | None = None,
  past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
- prev_layer_updates: dict[str, Tensor] | None = None,
- return_aux_kv_loss = False,
- chunk_size = None,
+ seq_index = 0
  ):
- seq_len, heads, chunk_size = seq.shape[-2], self.heads, default(chunk_size, self.store_chunk_size)
-
- # handle edge case
-
- if seq_len < chunk_size:
- return TensorDict(weights).clone().zero_(), self.zero
-
- seq = self.store_norm(seq)
+ batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, self.store_chunk_size

  # curtail sequence by multiple of the chunk size
  # only a complete chunk of the sequence provides the memory for the next chunk

  round_down_seq_len = round_down_multiple(seq_len, chunk_size)
+ num_chunks = round_down_seq_len // chunk_size

  seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]

- # per sample grad function
-
- per_sample_grad_fn = self.per_sample_grad_fn
+ next_seq_len_index = seq_index + round_down_seq_len

+ # init weights if needed
  # weights of the memory network

+ if not exists(weights):
+ weights = self.init_weights(batch)
+
  weights = TensorDict(weights)

  # allow for neural memory of a previous layer to influence surprise of current layer

- weights_for_surprise = weights
-
- if exists(prev_layer_updates):
- prev_layer_updates = TensorDict(prev_layer_updates)
-
- weights_for_surprise = weights_for_surprise + prev_layer_updates
-
- per_sample_grad_fn = self.per_sample_grad_fn_expanded_weights # the weights will now have a batch * chunk dimension
+ weights_for_surprise = repeat_dict_values(weights, 'b ... -> b n ...', n = num_chunks)

  # derive learned hparams for optimization of memory network

+ seq = self.store_norm(seq)
+
  adaptive_lr = self.to_adaptive_step(seq)
  adaptive_lr = self.adaptive_step_transform(adaptive_lr)
@@ -474,7 +511,7 @@ class NeuralMemory(Module):

  decay_factor = self.to_decay_factor(chunked_seq).sigmoid()

- need_layer_lr_mod = exists(self.to_layer_modulation)
+ need_layer_lr_mod = exists(self.to_layer_modulation) and num_chunks > 0
  has_momentum = exists(self.to_momentum)

  if has_momentum:
@@ -505,12 +542,11 @@ class NeuralMemory(Module):

  # flatten batch and time if surprise depends on previous layer memory model

- if exists(prev_layer_updates):
- weights_for_surprise = weights_for_surprise.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+ weights_for_surprise = rearrange_dict_values(weights_for_surprise, 'b n ... -> (b n) ...')

  # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

- grads, aux_kv_recon_loss = per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)
+ grads = self.per_sample_grad_fn(dict(weights_for_surprise), keys, adaptive_lr, values)

  grads = TensorDict(grads)
@@ -521,7 +557,7 @@ class NeuralMemory(Module):

  # restore batch and sequence dimension

- grads = grads.apply(lambda t: rearrange(t, '(b n) ... -> b n ...', b = batch * heads))
+ grads = rearrange_dict_values(grads, '(b n) ... -> b n ...', b = batch * heads)

  # maybe per layer modulation
@@ -535,19 +571,25 @@ class NeuralMemory(Module):
  # past states

  if not exists(past_state):
- empty_dict = {key: None for key in weights.keys()}
-
  # minibatch_init_weight corresponds to W0 in figure 7 of TTT paper

  minibatch_init_weight = weights
+ init_momentum = self.init_momentum(batch)

- if dict_get_shape(weights) == self.init_weight_shape:
- minibatch_init_weight = weights.apply(lambda t: repeat(t, '... -> b 1 (...)', b = batch * heads))
-
- past_state = (minibatch_init_weight, empty_dict)
+ past_state = (minibatch_init_weight, init_momentum)

  past_last_update, past_last_momentum = past_state

+ # early return if sequence length less than chunk size
+
+ if num_chunks == 0:
+ updates = rearrange_dict_values(weights, 'bh ... -> bh 1 ...')
+ next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, past_state, updates)
+
+ output = (updates, next_store_state)
+
+ return output
+
  # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

  next_momentum = TensorDict() if has_momentum else None
@@ -558,8 +600,6 @@ class NeuralMemory(Module):

  for (param_name, surprise), (_, last_update), (_, last_momentum) in zip(surprises.items(), past_last_update.items(), past_last_momentum.items()):

- surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')
-
  update = surprise

  # derive momentum with associative scan - eq (10)
@@ -571,62 +611,51 @@ class NeuralMemory(Module):

  # use associative scan again for learned forgetting (weight decay) - eq (13)

- update = self.assoc_scan(1. - decay_factor, update, prev = last_update)
+ update = self.assoc_scan(1. - decay_factor, update, prev = last_update, remove_prev = False)
  next_last_update[param_name] = update[:, -1]

- updates[param_name] = inverse_pack(update)
+ updates[param_name] = update

  if has_momentum:
- next_momentum[param_name] = inverse_pack(momentum)
+ next_momentum[param_name] = momentum

  # determine next state for the storing of memories

  next_state = (next_last_update, next_last_momentum)

- next_store_state = NeuralMemCache(seq_len, remainder, next_state, updates)
+ next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, next_state, updates)

  # returns

  output = (updates, next_store_state)

- if not return_aux_kv_loss:
- return output
-
- return output, aux_kv_recon_loss.mean()
+ return output

  def retrieve_memories(
  self,
  seq,
  past_weights: dict[str, Tensor],
- chunk_size = None,
- prev_layer_updates: dict[str, Tensor] | None = None
  ):
- chunk_size = default(chunk_size, self.retrieve_chunk_size)
+ chunk_size = self.retrieve_chunk_size
  batch, seq_len = seq.shape[:2]

  seq = self.retrieve_norm(seq)

- assert seq_len >= chunk_size, 'must be handled outside of retrieve'
-
  needs_pad = chunk_size > 1

- if needs_pad:
- seq = pad_at_dim(seq, (1, 0), dim = 1)
- seq_len_plus_one = seq.shape[-2]
+ seq = pad_at_dim(seq, (1, 0), dim = 1)
+ seq_len_plus_one = seq.shape[-2]

- next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)
+ next_seq_len = round_up_multiple(seq_len_plus_one, chunk_size)

- padding = next_seq_len - seq_len_plus_one
- seq = pad_at_dim(seq, (0, padding), dim = 1)
+ padding = next_seq_len - seq_len_plus_one
+ seq = pad_at_dim(seq, (0, padding), dim = 1)

  # the parameters of the memory model stores the memories of the key / values
  # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper

  curr_weights = TensorDict(past_weights)

- if exists(prev_layer_updates):
- curr_weights = curr_weights + TensorDict(prev_layer_updates)
-
  # sequence Float['b n d'] to queries

  queries = self.to_queries(seq)
@@ -642,7 +671,7 @@ class NeuralMemory(Module):
  # fetch values from memory model

  if dict_get_shape(curr_weights) != self.init_weight_shape:
- curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
+ curr_weights = rearrange_dict_values(curr_weights, 'b n ... -> (b n) ...')

  queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)
@@ -669,8 +698,7 @@ class NeuralMemory(Module):

  # restore, pad with empty memory embed

- if needs_pad:
- values = values[:, 1:(seq_len + 1)]
+ values = values[:, 1:(seq_len + 1)]

  return values
@@ -678,16 +706,14 @@ class NeuralMemory(Module):
  def forward_inference(
  self,
  token: Tensor,
- state = None,
- prev_layer_updates: dict[str, Tensor] | None = None,
+ state: NeuralMemCache | None = None,
  ):
-
  # unpack previous state

  if not exists(state):
- state = (0, None, None, None)
+ state = (0, None, None, None, None)

- seq_index, cache_store_seq, past_states, updates = state
+ seq_index, weights, cache_store_seq, past_states, updates = state

  curr_seq_len = seq_index + 1
  batch = token.shape[0]
@@ -695,9 +721,7 @@ class NeuralMemory(Module):
  if token.ndim == 2:
  token = rearrange(token, 'b d -> b 1 d')

- # get memory model weights
-
- weights = self.init_weights()
+ assert token.shape[1] == 1

  # increment the sequence cache which is at most the chunk size
@@ -708,7 +732,7 @@ class NeuralMemory(Module):
  if curr_seq_len < self.chunk_size:
  retrieve = self.retrieve_memories(token, weights, chunk_size = 1)

- output = retrieve, NeuralMemCache(curr_seq_len, cache_store_seq, past_states, updates)
+ output = retrieve, NeuralMemCache(curr_seq_len, weights, cache_store_seq, past_states, updates)

  return output
@@ -719,21 +743,16 @@ class NeuralMemory(Module):

  if not exists(updates):
  updates = weights.clone().zero_()
- updates = updates.apply(lambda t: repeat(t, '... -> b 1 ...', b = batch))
+ updates = repeat_dict_values(updates, '... -> b 1 ...', b = batch)
  else:
  updates = updates.apply(lambda t: t[:, -1:])

- if exists(prev_layer_updates):
- prev_layer_updates = TensorDict(prev_layer_updates)
- prev_layer_updates = prev_layer_updates.apply(lambda t: t[:, -1:])
-
  if store_seq_cache_len == self.chunk_size:

  next_updates, store_state = self.store_memories(
  cache_store_seq,
  weights,
  past_state = past_states,
- prev_layer_updates = prev_layer_updates,
  )

  updates = next_updates
@@ -746,7 +765,7 @@ class NeuralMemory(Module):

  # next state tuple

- next_store_state = NeuralMemCache(curr_seq_len, cache_store_seq, next_states, updates)
+ next_store_state = NeuralMemCache(curr_seq_len, weights, cache_store_seq, next_states, updates)

  return retrieved, next_store_state
@@ -754,63 +773,99 @@ class NeuralMemory(Module):
  self,
  seq,
  store_seq = None,
- mem_model_weights: dict[str, Tensor] | None = None,
- past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
- return_aux_kv_loss = False,
- chunk_size = None,
- store_chunk_size = None,
- return_next_state = False,
- prev_layer_updates: dict[str, Tensor] | None = None
+ state: NeuralMemCache | None = None,
  ):
- batch, seq_len = seq.shape[:2]
+ if not exists(state):
+ state = (0, None, None, None, None)

- if not exists(mem_model_weights):
- mem_model_weights = self.init_weights()
+ seq_index, weights, cache_store_seq, past_state, updates = state

- if seq_len < self.retrieve_chunk_size:
- retrieved = self.retrieve_memories(seq, mem_model_weights, chunk_size = 1)
+ assert not exists(cache_store_seq) or is_empty_tensor(cache_store_seq)

- next_store_state = NeuralMemCache(seq_len, seq, None, None)
+ # store

- out = (retrieved, next_store_state)
+ store_seq = default(store_seq, seq)

- if not return_aux_kv_loss:
- return out
+ # functions

- return out, self.zero
+ # compute split sizes of sequence
+ # for now manually update weights to last update at the correct boundaries

- # store
+ store_seq_len, chunk_size, batch_size = store_seq.shape[-2], self.chunk_size, self.batch_size

- store_seq = default(store_seq, seq)
+ need_update_weights = exists(batch_size)

- (updates, next_store_state), aux_kv_recon_loss = self.store_memories(
- store_seq,
- mem_model_weights,
- chunk_size = store_chunk_size,
- prev_layer_updates = prev_layer_updates,
- return_aux_kv_loss = True
- )
+ # determine split sizes and when to update

- # retrieve
+ if need_update_weights:
+ update_after_final_store = divisible_by(seq_index + store_seq_len, batch_size)
+
+ seq_range = torch.arange(store_seq_len) + seq_index + 1
+ batch_boundary = divisible_by(seq_range, batch_size)
+
+ indices = seq_range[batch_boundary] - seq_index
+
+ indices = F.pad(indices, (1, 0), value = 0)
+
+ if indices[-1] != store_seq_len:
+ indices = F.pad(indices, (0, 1), value = store_seq_len)
+
+ split_sizes = (indices[1:] - indices[:-1]).tolist()
+
+ assert sum(split_sizes) == store_seq_len
+ else:
+ split_sizes = (store_seq_len,)
+ update_after_final_store = False

- retrieve_chunk_size = default(chunk_size, self.retrieve_chunk_size)
+ # accumulate updates

- if retrieve_chunk_size != 1:
- if exists(prev_layer_updates):
- prev_layer_updates = prev_layer_updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+ updates = None

- updates = updates.apply(lambda t: pad_at_dim(t, (1, 0), dim = 1))
+ def accum_updates(past_updates, future_updates):
+ if not exists(past_updates):
+ return future_updates
+
+ return TensorDict({param_name: cat((past_update[:, :-1], future_update), dim = 1) for (param_name, past_update), (_, future_update) in zip(past_updates.items(), future_updates.items())})
+
+ # loop through chunks of store sequences
+
+ store_seqs = store_seq.split(split_sizes, dim = -2)
+
+ for ind, store_seq_chunk in enumerate(store_seqs):
+ is_last = ind == (len(store_seqs) - 1)
+
+ # store
+
+ next_updates, next_neural_mem_state = self.store_memories(
+ store_seq_chunk,
+ weights,
+ seq_index = seq_index,
+ past_state = past_state,
+ )
+
+ seq_index = next_neural_mem_state.seq_index
+ past_state = next_neural_mem_state.states
+
+ updates = accum_updates(updates, next_updates)
+
+ if is_last and not update_after_final_store:
+ continue
+
+ # update weights once batch size is fulfilled
+
+ last_update, _ = past_state
+
+ weights = last_update
+
+ next_neural_mem_state = list(next_neural_mem_state)
+ next_neural_mem_state[1] = last_update
+ next_neural_mem_state = NeuralMemCache(*next_neural_mem_state)
+
+ # retrieve

  retrieved = self.retrieve_memories(
  seq,
- updates,
- chunk_size = chunk_size,
- prev_layer_updates = prev_layer_updates
+ updates
  )

- output = (retrieved, next_store_state)
-
- if not return_aux_kv_loss:
- return output
-
- return output, aux_kv_recon_loss
+ return retrieved, next_neural_mem_state
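The public `forward` now keeps all recurrent state inside the returned `NeuralMemCache`, committing weights whenever the stored token count crosses a `batch_size` boundary as it walks the split store sequence. A hedged sketch of chaining state across calls, continuing from the `mem` module constructed earlier (segment lengths kept at multiples of the chunk size so the store buffer stays empty between calls):

    seq = torch.randn(1, 1024, 384)
    first, second = seq[:, :512], seq[:, 512:]

    retrieved_1, state = mem(first)                  # state is a NeuralMemCache
    retrieved_2, state = mem(second, state = state)  # resumes at seq_index 512 with the committed weights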
titans_pytorch-0.2.9.dist-info/METADATA → titans_pytorch-0.2.11.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.2.9
+ Version: 0.2.11
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
titans_pytorch-0.2.11.dist-info/RECORD (added)

@@ -0,0 +1,9 @@
+ titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
+ titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+ titans_pytorch/mac_transformer.py,sha256=RfJ1SvQH5_4PmlB7g-13wPAqYtCCUJxfmtaL0oBrRCU,24563
+ titans_pytorch/memory_models.py,sha256=Q9SAIyAbStF5Tz0EhvRbn3yAdE3nk3xKc1ndieIe714,4671
+ titans_pytorch/neural_memory.py,sha256=1wX8dbGENHWk7sfz7IFF1G8KY4U5tsNh3cqSDxTUf2U,26150
+ titans_pytorch-0.2.11.dist-info/METADATA,sha256=CMLW5FSamLp0cPhIohOD_yXjCXoxqCPzwJrA0e83vQE,6812
+ titans_pytorch-0.2.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ titans_pytorch-0.2.11.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ titans_pytorch-0.2.11.dist-info/RECORD,,
titans_pytorch-0.2.9.dist-info/RECORD (removed)

@@ -1,9 +0,0 @@
- titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
- titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
- titans_pytorch/mac_transformer.py,sha256=UOJAMv7nTgkefBB7M7K3U0NnFkz75tFRG5WLXRdfnLw,26039
- titans_pytorch/memory_models.py,sha256=Q9SAIyAbStF5Tz0EhvRbn3yAdE3nk3xKc1ndieIe714,4671
- titans_pytorch/neural_memory.py,sha256=YVbKl7DYKFWUgCawDTxXIEgJAcl7nq5OaZytmovIl8Q,24899
- titans_pytorch-0.2.9.dist-info/METADATA,sha256=fSFt54zXLKB5gRhLTJd9551O0pF2qcYNlR7039yJiD0,6811
- titans_pytorch-0.2.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- titans_pytorch-0.2.9.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- titans_pytorch-0.2.9.dist-info/RECORD,,