titans-pytorch 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/titans_pytorch/mac_transformer.py
+++ b/titans_pytorch/mac_transformer.py
@@ -481,6 +481,7 @@ class MemoryAsContextTransformer(Module):
         neural_memory_add_value_residual = False,
         num_longterm_mem_tokens = 0,
         num_persist_mem_tokens = 0,
+        neural_memory_batch_size = None,
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
@@ -551,6 +552,7 @@ class MemoryAsContextTransformer(Module):
             mem = NeuralMemory(
                 dim = dim,
                 chunk_size = self.neural_memory_segment_len,
+                batch_size = neural_memory_batch_size,
                 model = deepcopy(neural_memory_model),
                 **neural_memory_kwargs
             )
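The new neural_memory_batch_size kwarg is forwarded into each NeuralMemory block as batch_size. A minimal sketch of constructing the transformer with it; the other argument values are illustrative (taken from the project README, not this diff), and the relationship between the neural memory chunk size and segment_len + num_longterm_mem_tokens is an assumption:

    import torch
    from titans_pytorch import MemoryAsContextTransformer

    transformer = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 256,
        depth = 2,
        segment_len = 128,
        num_persist_mem_tokens = 4,
        num_longterm_mem_tokens = 16,
        neural_memory_batch_size = 288,  # new in 0.2.11; None preserves the old behavior.
                                         # assumed to need to be a multiple of the neural memory
                                         # store chunk size (here segment_len + num_longterm_mem_tokens = 144)
    )

    logits = transformer(torch.randint(0, 256, (1, 1023)))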
--- a/titans_pytorch/neural_memory.py
+++ b/titans_pytorch/neural_memory.py
@@ -6,7 +6,7 @@ from functools import partial
 from collections import namedtuple
 
 import torch
-from torch import nn, cat, Tensor
+from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
 from torch.nn import Linear, Module, Parameter, ParameterList
 from torch.func import functional_call, vmap, grad
@@ -39,7 +39,7 @@ w - num memory network weight parameters
 LinearNoBias = partial(Linear, bias = False)
 
 NeuralMemCache = namedtuple('NeuralMemCache', [
-    'seq',
+    'seq_index',
     'weights',
     'cache_store_segment',
    'states',
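For reference, the NeuralMemCache(...) call sites later in this diff show the cache carrying five fields ('updates' being the fifth, beyond the hunk context above). A hypothetical unpacking under that assumption:

    # field order inferred from the constructor calls below in this diff
    seq_index, weights, cache_store_segment, states, updates = state

    state.seq_index  # previously state.seq -- downstream code reading .seq must be updated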
@@ -63,6 +63,9 @@ def identity(t):
 def xnor(x, y):
     return not (x ^ y)
 
+def divisible_by(num, den):
+    return (num % den) == 0
+
 def safe_cat(inputs, dim = -2):
     inputs = tuple(filter(exists, inputs))
 
@@ -73,6 +76,9 @@ def safe_cat(inputs, dim = -2):
 
     return cat(inputs, dim = dim)
 
+def is_empty_tensor(t):
+    return t.numel() == 0
+
 def dict_get_shape(td):
     return {k: v.shape for k, v in td.items()}
 
@@ -118,7 +124,7 @@ def softclamp_max(t, max_value):
     return ((t / half_max_value).tanh() * half_max_value) + half_max_value
 
 def softclamp_grad_norm(t, max_value):
-    if t.numel() == 0:
+    if is_empty_tensor(t):
         return t
 
     t, inverse = pack_one_with_inverse(t, 'bn *')
@@ -270,6 +276,7 @@ class NeuralMemory(Module):
         self,
         dim,
         chunk_size: int | tuple[int, int] = 1,
+        batch_size = None,
         dim_head = None,
         heads = 1,
         model: Module | None = None,
@@ -296,6 +303,13 @@ class NeuralMemory(Module):
 
         self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)
 
+        # batch size
+
+        if exists(batch_size):
+            assert divisible_by(batch_size, self.store_chunk_size)
+
+        self.batch_size = batch_size
+
         # associative scan
 
         self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)
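A sketch of constructing NeuralMemory directly with the new argument; the dim and chunk_size values are illustrative. Per the assert above, batch_size must be a multiple of the store chunk size:

    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(
        dim = 384,
        chunk_size = 64,   # store (and retrieve) chunk size
        batch_size = 128,  # ok: divisible_by(128, 64); batch_size = 100 would trip the assert
    )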
@@ -460,9 +474,9 @@ class NeuralMemory(Module):
         seq,
         weights: dict[str, Tensor] | None = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        chunk_size = None,
+        seq_index = 0
     ):
-        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, default(chunk_size, self.store_chunk_size)
+        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, self.store_chunk_size
 
         # curtail sequence by multiple of the chunk size
         # only a complete chunk of the sequence provides the memory for the next chunk
@@ -472,6 +486,8 @@ class NeuralMemory(Module):
 
         seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]
 
+        next_seq_len_index = seq_index + round_down_seq_len
+
         # init weights if needed
         # weights of the memory network
 
@@ -568,7 +584,7 @@ class NeuralMemory(Module):
 
         if num_chunks == 0:
             updates = rearrange_dict_values(weights, 'bh ... -> bh 1 ...')
-            next_store_state = NeuralMemCache(seq_len, weights, remainder, past_state, updates)
+            next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, past_state, updates)
 
             output = (updates, next_store_state)
 
@@ -607,7 +623,7 @@ class NeuralMemory(Module):
 
         next_state = (next_last_update, next_last_momentum)
 
-        next_store_state = NeuralMemCache(seq_len, weights, remainder, next_state, updates)
+        next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, next_state, updates)
 
         # returns
 
@@ -619,9 +635,8 @@ class NeuralMemory(Module):
         self,
         seq,
         past_weights: dict[str, Tensor],
-        chunk_size = None,
     ):
-        chunk_size = default(chunk_size, self.retrieve_chunk_size)
+        chunk_size = self.retrieve_chunk_size
         batch, seq_len = seq.shape[:2]
 
         seq = self.retrieve_norm(seq)
@@ -691,9 +706,8 @@ class NeuralMemory(Module):
     def forward_inference(
         self,
         token: Tensor,
-        state = None,
+        state: NeuralMemCache | None = None,
     ):
-
         # unpack previous state
 
         if not exists(state):
@@ -707,6 +721,8 @@ class NeuralMemory(Module):
         if token.ndim == 2:
             token = rearrange(token, 'b d -> b 1 d')
 
+        assert token.shape[1] == 1
+
         # increment the sequence cache which is at most the chunk size
 
         cache_store_seq = safe_cat((cache_store_seq, token), dim = -2)
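With the stricter signature, decoding must feed exactly one token per call. A hypothetical loop, assuming forward_inference returns (retrieved, next_state) the way the new forward(...) at the end of this diff does:

    state = None

    for token in tokens.unbind(dim = 1):  # each token is (batch, dim); the new assert rejects multi-token inputs
        retrieved, state = mem.forward_inference(token, state = state)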
@@ -757,32 +773,99 @@ class NeuralMemory(Module):
         self,
         seq,
         store_seq = None,
-        mem_model_weights: dict[str, Tensor] | None = None,
-        past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        chunk_size = None,
-        store_chunk_size = None,
-        return_next_state = False,
+        state: NeuralMemCache | None = None,
     ):
-        batch, seq_len = seq.shape[:2]
+        if not exists(state):
+            state = (0, None, None, None, None)
+
+        seq_index, weights, cache_store_seq, past_state, updates = state
+
+        assert not exists(cache_store_seq) or is_empty_tensor(cache_store_seq)
 
         # store
 
         store_seq = default(store_seq, seq)
 
-        updates, next_store_state = self.store_memories(
-            store_seq,
-            mem_model_weights,
-            chunk_size = store_chunk_size,
-        )
+        # functions
+
+        # compute split sizes of sequence
+        # for now manually update weights to last update at the correct boundaries
+
+        store_seq_len, chunk_size, batch_size = store_seq.shape[-2], self.chunk_size, self.batch_size
+
+        need_update_weights = exists(batch_size)
+
+        # determine split sizes and when to update
+
+        if need_update_weights:
+            update_after_final_store = divisible_by(seq_index + store_seq_len, batch_size)
+
+            seq_range = torch.arange(store_seq_len) + seq_index + 1
+            batch_boundary = divisible_by(seq_range, batch_size)
+
+            indices = seq_range[batch_boundary] - seq_index
+
+            indices = F.pad(indices, (1, 0), value = 0)
+
+            if indices[-1] != store_seq_len:
+                indices = F.pad(indices, (0, 1), value = store_seq_len)
+
+            split_sizes = (indices[1:] - indices[:-1]).tolist()
+
+            assert sum(split_sizes) == store_seq_len
+        else:
+            split_sizes = (store_seq_len,)
+            update_after_final_store = False
+
+        # accumulate updates
+
+        updates = None
+
+        def accum_updates(past_updates, future_updates):
+            if not exists(past_updates):
+                return future_updates
+
+            return TensorDict({param_name: cat((past_update[:, :-1], future_update), dim = 1) for (param_name, past_update), (_, future_update) in zip(past_updates.items(), future_updates.items())})
+
+        # loop through chunks of store sequences
+
+        store_seqs = store_seq.split(split_sizes, dim = -2)
+
+        for ind, store_seq_chunk in enumerate(store_seqs):
+            is_last = ind == (len(store_seqs) - 1)
+
+            # store
+
+            next_updates, next_neural_mem_state = self.store_memories(
+                store_seq_chunk,
+                weights,
+                seq_index = seq_index,
+                past_state = past_state,
+            )
+
+            seq_index = next_neural_mem_state.seq_index
+            past_state = next_neural_mem_state.states
+
+            updates = accum_updates(updates, next_updates)
+
+            if is_last and not update_after_final_store:
+                continue
+
+            # update weights once batch size is fulfilled
+
+            last_update, _ = past_state
+
+            weights = last_update
+
+            next_neural_mem_state = list(next_neural_mem_state)
+            next_neural_mem_state[1] = last_update
+            next_neural_mem_state = NeuralMemCache(*next_neural_mem_state)
 
         # retrieve
 
         retrieved = self.retrieve_memories(
             seq,
-            updates,
-            chunk_size = chunk_size,
+            updates
         )
 
-        output = (retrieved, next_store_state)
-
-        return output
+        return retrieved, next_neural_mem_state
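The rewritten forward collapses the old keyword arguments into a single NeuralMemCache threaded between calls, splitting the store sequence at batch_size boundaries and committing the memory weights to the last update at each boundary. A minimal sketch of the new calling convention, with illustrative values, assuming sequence lengths divisible by the chunk size so the cached remainder stays empty (as the new assert requires):

    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(dim = 384, chunk_size = 64, batch_size = 128)

    seq = torch.randn(2, 1024, 384)

    retrieved, state = mem(seq)  # state starts as (0, None, None, None, None)

    # seq_index advances by the stored length; weights are committed to the
    # last update at each batch_size boundary inside the split loop
    retrieved, state = mem(torch.randn(2, 1024, 384), state = state)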
--- titans_pytorch-0.2.10.dist-info/METADATA
+++ titans_pytorch-0.2.11.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.2.10
+Version: 0.2.11
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
--- /dev/null
+++ titans_pytorch-0.2.11.dist-info/RECORD
@@ -0,0 +1,9 @@
+titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=RfJ1SvQH5_4PmlB7g-13wPAqYtCCUJxfmtaL0oBrRCU,24563
+titans_pytorch/memory_models.py,sha256=Q9SAIyAbStF5Tz0EhvRbn3yAdE3nk3xKc1ndieIe714,4671
+titans_pytorch/neural_memory.py,sha256=1wX8dbGENHWk7sfz7IFF1G8KY4U5tsNh3cqSDxTUf2U,26150
+titans_pytorch-0.2.11.dist-info/METADATA,sha256=CMLW5FSamLp0cPhIohOD_yXjCXoxqCPzwJrA0e83vQE,6812
+titans_pytorch-0.2.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.2.11.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.2.11.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
2
- titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
3
- titans_pytorch/mac_transformer.py,sha256=dmS37yBN0j9OqoMCsojuIPfT1EXLN8ackRdZwPb8xDY,24463
4
- titans_pytorch/memory_models.py,sha256=Q9SAIyAbStF5Tz0EhvRbn3yAdE3nk3xKc1ndieIe714,4671
5
- titans_pytorch/neural_memory.py,sha256=kc-cV7dK3WhdqRfOCrPW91nA0F56jUK94TE1irckQ34,23487
6
- titans_pytorch-0.2.10.dist-info/METADATA,sha256=k7u9eQDNAWG3QqzGqhcdN21D6LYWWRWdd5wZFb560q0,6812
7
- titans_pytorch-0.2.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
- titans_pytorch-0.2.10.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
9
- titans_pytorch-0.2.10.dist-info/RECORD,,