titans-pytorch 0.2.10__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/PKG-INFO +1 -1
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/pyproject.toml +1 -1
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/tests/test_titans.py +39 -16
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/titans_pytorch/mac_transformer.py +2 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/titans_pytorch/neural_memory.py +110 -27
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/train_mac.py +3 -1
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/.gitignore +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/LICENSE +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/README.md +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/data/README.md +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/data/enwik8.gz +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/fig1.png +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/fig2.png +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.2.10 → titans_pytorch-0.2.11}/titans_pytorch/memory_models.py +0 -0
tests/test_titans.py

@@ -73,41 +73,62 @@ def test_titans_attn_memory():

     assert seq.shape == retrieved.shape

-def
-    mem
+def test_neural_mem_chaining_chunks():
+    mem = NeuralMemory(
         dim = 384,
-
+        dim_head = 64,
+        heads = 2,
+        chunk_size = 16
     )

-
-    store_seq = torch.randn(2, 64 * 32, 384)
+    seq = torch.randn(2, 48, 384)

-
+    parallel_retrieved, state = mem(seq)

-
+    seq_first, seq_second, seq_third = seq.split(16, dim = 1)

-
-    mem =
+    first_retrieved, state = mem(seq_first)
+    second_retrieved, state = mem(seq_second, state = state)
+    third_retrieved, state = mem(seq_third, state = state)
+
+    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved, third_retrieved), dim = 1), atol = 1e-5)
+
+def test_neural_mem_chaining_with_batch_size():
+    mem = NeuralMemory(
         dim = 384,
-
+        dim_head = 64,
+        heads = 2,
+        chunk_size = 16,
+        batch_size = 64
     )

-    seq = torch.randn(2,
-    store_seq = torch.randn(2, 128 * 8, 384)
+    seq = torch.randn(2, 112, 384)

-
+    parallel_retrieved, state = mem(seq)

-
+    seq_first, seq_second, seq_third = seq[:, :16], seq[:, 16:64], seq[:, 64:]
+
+    first_retrieved, state = mem(seq_first)
+    second_retrieved, state = mem(seq_second, state = state)
+    third_retrieved, state = mem(seq_third, state = state)
+
+    parallel_part_retrieved = torch.cat((first_retrieved, second_retrieved, third_retrieved), dim = 1)
+
+    assert torch.allclose(parallel_retrieved, parallel_part_retrieved, atol = 1e-5)

 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('num_persist_mem_tokens', (0, 16))
 @pytest.mark.parametrize('num_longterm_mem_tokens', (0, 16))
 @pytest.mark.parametrize('neural_mem_gate_attn_output', (False, True))
+@pytest.mark.parametrize('neural_mem_segment_len', (8, 16))
+@pytest.mark.parametrize('neural_mem_batch_size', (None, 64))
 def test_mac(
     seq_len,
     num_persist_mem_tokens,
     num_longterm_mem_tokens,
-    neural_mem_gate_attn_output
+    neural_mem_gate_attn_output,
+    neural_mem_segment_len,
+    neural_mem_batch_size
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,

@@ -116,7 +137,9 @@ def test_mac(
         num_persist_mem_tokens = num_persist_mem_tokens,
         num_longterm_mem_tokens = num_longterm_mem_tokens,
         segment_len = 128,
-        neural_mem_gate_attn_output = neural_mem_gate_attn_output
+        neural_mem_gate_attn_output = neural_mem_gate_attn_output,
+        neural_memory_segment_len = neural_mem_segment_len,
+        neural_memory_batch_size = neural_mem_batch_size,
     )

     x = torch.randint(0, 256, (1, seq_len))

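The two new chaining tests pin down the reworked NeuralMemory.forward contract: one parallel pass over the full sequence must match the concatenation of several shorter calls that thread the returned NeuralMemCache back in via `state`. A minimal sketch of that calling pattern, with illustrative dimensions (the import path and the 1e-5 tolerance mirror the tests above; everything else is an assumed example, not library-mandated values):

import torch
from titans_pytorch import NeuralMemory

# memory whose store / retrieve chunk size is 16 tokens
mem = NeuralMemory(
    dim = 384,
    chunk_size = 16
)

seq = torch.randn(2, 48, 384)

# one parallel pass over the whole sequence
parallel_retrieved, _ = mem(seq)

# the same sequence fed 16 tokens at a time, carrying the returned state forward
state = None
chunks = []

for chunk in seq.split(16, dim = 1):
    retrieved, state = mem(chunk, state = state)
    chunks.append(retrieved)

chained_retrieved = torch.cat(chunks, dim = 1)

assert torch.allclose(parallel_retrieved, chained_retrieved, atol = 1e-5)
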
titans_pytorch/mac_transformer.py

@@ -481,6 +481,7 @@ class MemoryAsContextTransformer(Module):
         neural_memory_add_value_residual = False,
         num_longterm_mem_tokens = 0,
         num_persist_mem_tokens = 0,
+        neural_memory_batch_size = None,
         dim_head = 64,
         heads = 8,
         ff_mult = 4,

@@ -551,6 +552,7 @@ class MemoryAsContextTransformer(Module):
             mem = NeuralMemory(
                 dim = dim,
                 chunk_size = self.neural_memory_segment_len,
+                batch_size = neural_memory_batch_size,
                 model = deepcopy(neural_memory_model),
                 **neural_memory_kwargs
             )

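The new neural_memory_batch_size kwarg is simply forwarded to every per-layer NeuralMemory as batch_size. A hedged construction sketch (dim and depth are not shown in this diff and are assumed from the package's existing constructor; the other values mirror the updated test_mac / train_mac settings):

import torch
from titans_pytorch import MemoryAsContextTransformer

transformer = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 384,                        # assumed existing kwarg
    depth = 2,                        # assumed existing kwarg
    segment_len = 128,
    num_persist_mem_tokens = 16,
    num_longterm_mem_tokens = 16,
    neural_memory_segment_len = 16,   # becomes each NeuralMemory's chunk_size
    neural_memory_batch_size = 64     # new in 0.2.11, forwarded as batch_size
)

x = torch.randint(0, 256, (1, 1023))
logits = transformer(x)               # (batch, seq_len, num_tokens) logits
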
titans_pytorch/neural_memory.py

@@ -6,7 +6,7 @@ from functools import partial
 from collections import namedtuple

 import torch
-from torch import nn, cat, Tensor
+from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
 from torch.nn import Linear, Module, Parameter, ParameterList
 from torch.func import functional_call, vmap, grad

@@ -39,7 +39,7 @@ w - num memory network weight parameters
 LinearNoBias = partial(Linear, bias = False)

 NeuralMemCache = namedtuple('NeuralMemCache', [
-    '
+    'seq_index',
     'weights',
     'cache_store_segment',
     'states',

@@ -63,6 +63,9 @@ def identity(t):
 def xnor(x, y):
     return not (x ^ y)

+def divisible_by(num, den):
+    return (num % den) == 0
+
 def safe_cat(inputs, dim = -2):
     inputs = tuple(filter(exists, inputs))

@@ -73,6 +76,9 @@ def safe_cat(inputs, dim = -2):

     return cat(inputs, dim = dim)

+def is_empty_tensor(t):
+    return t.numel() == 0
+
 def dict_get_shape(td):
     return {k: v.shape for k, v in td.items()}

@@ -118,7 +124,7 @@ def softclamp_max(t, max_value):
     return ((t / half_max_value).tanh() * half_max_value) + half_max_value

 def softclamp_grad_norm(t, max_value):
-    if t
+    if is_empty_tensor(t):
         return t

     t, inverse = pack_one_with_inverse(t, 'bn *')

@@ -270,6 +276,7 @@ class NeuralMemory(Module):
         self,
         dim,
         chunk_size: int | tuple[int, int] = 1,
+        batch_size = None,
         dim_head = None,
         heads = 1,
         model: Module | None = None,

@@ -296,6 +303,13 @@ class NeuralMemory(Module):

         self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)

+        # batch size
+
+        if exists(batch_size):
+            assert divisible_by(batch_size, self.store_chunk_size)
+
+        self.batch_size = batch_size
+
         # associative scan

         self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)

@@ -460,9 +474,9 @@ class NeuralMemory(Module):
         seq,
         weights: dict[str, Tensor] | None = None,
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-
+        seq_index = 0
     ):
-        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads,
+        batch, seq_len, heads, chunk_size = *seq.shape[:2], self.heads, self.store_chunk_size

         # curtail sequence by multiple of the chunk size
         # only a complete chunk of the sequence provides the memory for the next chunk

@@ -472,6 +486,8 @@ class NeuralMemory(Module):

         seq, remainder = seq[:, :round_down_seq_len], seq[:, round_down_seq_len:]

+        next_seq_len_index = seq_index + round_down_seq_len
+
         # init weights if needed
         # weights of the memory network

@@ -568,7 +584,7 @@ class NeuralMemory(Module):

         if num_chunks == 0:
             updates = rearrange_dict_values(weights, 'bh ... -> bh 1 ...')
-            next_store_state = NeuralMemCache(
+            next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, past_state, updates)

             output = (updates, next_store_state)

@@ -607,7 +623,7 @@ class NeuralMemory(Module):

         next_state = (next_last_update, next_last_momentum)

-        next_store_state = NeuralMemCache(
+        next_store_state = NeuralMemCache(next_seq_len_index, weights, remainder, next_state, updates)

         # returns

@@ -619,9 +635,8 @@ class NeuralMemory(Module):
         self,
         seq,
         past_weights: dict[str, Tensor],
-        chunk_size = None,
     ):
-        chunk_size =
+        chunk_size = self.retrieve_chunk_size
         batch, seq_len = seq.shape[:2]

         seq = self.retrieve_norm(seq)

@@ -691,9 +706,8 @@ class NeuralMemory(Module):
     def forward_inference(
         self,
         token: Tensor,
-        state = None,
+        state: NeuralMemCache | None = None,
     ):
-
         # unpack previous state

         if not exists(state):

@@ -707,6 +721,8 @@ class NeuralMemory(Module):
         if token.ndim == 2:
             token = rearrange(token, 'b d -> b 1 d')

+        assert token.shape[1] == 1
+
         # increment the sequence cache which is at most the chunk size

         cache_store_seq = safe_cat((cache_store_seq, token), dim = -2)

@@ -757,32 +773,99 @@ class NeuralMemory(Module):
         self,
         seq,
         store_seq = None,
-
-        past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
-        chunk_size = None,
-        store_chunk_size = None,
-        return_next_state = False,
+        state: NeuralMemCache | None = None,
     ):
-
+        if not exists(state):
+            state = (0, None, None, None, None)
+
+        seq_index, weights, cache_store_seq, past_state, updates = state
+
+        assert not exists(cache_store_seq) or is_empty_tensor(cache_store_seq)

         # store

         store_seq = default(store_seq, seq)

-
-
-
-
-
+        # functions
+
+        # compute split sizes of sequence
+        # for now manually update weights to last update at the correct boundaries
+
+        store_seq_len, chunk_size, batch_size = store_seq.shape[-2], self.chunk_size, self.batch_size
+
+        need_update_weights = exists(batch_size)
+
+        # determine split sizes and when to update
+
+        if need_update_weights:
+            update_after_final_store = divisible_by(seq_index + store_seq_len, batch_size)
+
+            seq_range = torch.arange(store_seq_len) + seq_index + 1
+            batch_boundary = divisible_by(seq_range, batch_size)
+
+            indices = seq_range[batch_boundary] - seq_index
+
+            indices = F.pad(indices, (1, 0), value = 0)
+
+            if indices[-1] != store_seq_len:
+                indices = F.pad(indices, (0, 1), value = store_seq_len)
+
+            split_sizes = (indices[1:] - indices[:-1]).tolist()
+
+            assert sum(split_sizes) == store_seq_len
+        else:
+            split_sizes = (store_seq_len,)
+            update_after_final_store = False
+
+        # accumulate updates
+
+        updates = None
+
+        def accum_updates(past_updates, future_updates):
+            if not exists(past_updates):
+                return future_updates
+
+            return TensorDict({param_name: cat((past_update[:, :-1], future_update), dim = 1) for (param_name, past_update), (_, future_update) in zip(past_updates.items(), future_updates.items())})
+
+        # loop through chunks of store sequences
+
+        store_seqs = store_seq.split(split_sizes, dim = -2)
+
+        for ind, store_seq_chunk in enumerate(store_seqs):
+            is_last = ind == (len(store_seqs) - 1)
+
+            # store
+
+            next_updates, next_neural_mem_state = self.store_memories(
+                store_seq_chunk,
+                weights,
+                seq_index = seq_index,
+                past_state = past_state,
+            )
+
+            seq_index = next_neural_mem_state.seq_index
+            past_state = next_neural_mem_state.states
+
+            updates = accum_updates(updates, next_updates)
+
+            if is_last and not update_after_final_store:
+                continue
+
+            # update weights once batch size is fulfilled
+
+            last_update, _ = past_state
+
+            weights = last_update
+
+            next_neural_mem_state = list(next_neural_mem_state)
+            next_neural_mem_state[1] = last_update
+            next_neural_mem_state = NeuralMemCache(*next_neural_mem_state)

         # retrieve

         retrieved = self.retrieve_memories(
             seq,
-            updates
-            chunk_size = chunk_size,
+            updates
         )

-
-
-        return output
+        return retrieved, next_neural_mem_state

|
|
35
35
|
NEURAL_MEM_MOMENTUM = True
|
36
36
|
NEURAL_MEM_QK_NORM = True
|
37
37
|
WINDOW_SIZE = 32
|
38
|
-
NEURAL_MEM_SEGMENT_LEN =
|
38
|
+
NEURAL_MEM_SEGMENT_LEN = 2 # set smaller for more granularity for learning rate / momentum etc
|
39
|
+
NEURAL_MEM_BATCH_SIZE = 128 # set smaller to update the neural memory weights more often as it traverses the sequence
|
39
40
|
SLIDING_WINDOWS = True
|
40
41
|
STORE_ATTN_POOL_CHUNKS = True # whether to use attention pooling for chunk derived momentum, per-layer lr mod, decay
|
41
42
|
MEMORY_MODEL_PER_LAYER_LEARNED_LR = True
|
@@ -83,6 +84,7 @@ model = MemoryAsContextTransformer(
|
|
83
84
|
num_longterm_mem_tokens = NUM_LONGTERM_MEM,
|
84
85
|
neural_memory_layers = NEURAL_MEM_LAYERS,
|
85
86
|
neural_memory_segment_len = NEURAL_MEM_SEGMENT_LEN,
|
87
|
+
neural_memory_batch_size = NEURAL_MEM_BATCH_SIZE,
|
86
88
|
neural_mem_gate_attn_output = NEURAL_MEM_GATE_ATTN_OUTPUT,
|
87
89
|
use_flex_attn = USE_FLEX_ATTN,
|
88
90
|
sliding_window_attn = SLIDING_WINDOWS,
|