titans-pytorch 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/PKG-INFO +1 -1
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/pyproject.toml +1 -1
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/tests/test_titans.py +29 -26
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/associative_scan.py +87 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/memory_models.py +1 -1
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/neural_memory.py +66 -101
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/train_mac.py +19 -5
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/.gitignore +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/LICENSE +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/README.md +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/data/README.md +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/data/enwik8.gz +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/fig1.png +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/fig2.png +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/mac_transformer.py +0 -0
{titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/tests/test_titans.py

@@ -31,6 +31,7 @@ def torch_default_dtype(dtype):
 @pytest.mark.parametrize('qk_rmsnorm', (False, True))
 @pytest.mark.parametrize('max_grad_norm', (None, 2.))
 @pytest.mark.parametrize('per_parameter_lr_modulation', (False, True))
+@pytest.mark.parametrize('per_head_learned_parameters', (False, True))
 def test_titans(
     seq_len,
     silu,
@@ -39,10 +40,11 @@ def test_titans(
     momentum,
     qk_rmsnorm,
     max_grad_norm,
-    per_parameter_lr_modulation
+    per_parameter_lr_modulation,
+    per_head_learned_parameters
 ):
     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = chunk_size,
         activation = nn.SiLU() if silu else None,
         attn_pool_chunks = attn_pool_chunks,
@@ -50,9 +52,10 @@ def test_titans(
         momentum = momentum,
         qk_rmsnorm = qk_rmsnorm,
         per_parameter_lr_modulation = per_parameter_lr_modulation,
+        per_head_learned_parameters = per_head_learned_parameters
     )

-    seq = torch.randn(2, seq_len,
+    seq = torch.randn(2, seq_len, 16)
     retrieved, _ = mem(seq)

     assert seq.shape == retrieved.shape
@@ -61,14 +64,14 @@ def test_titans_attn_memory():
     from titans_pytorch.memory_models import MemoryAttention

     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = 64,
         model = MemoryAttention(
-            dim =
+            dim = 16
         )
     )

-    seq = torch.randn(2, 1024,
+    seq = torch.randn(2, 1024, 16)
     retrieved, _ = mem(seq)

     assert seq.shape == retrieved.shape
@@ -78,14 +81,14 @@ def test_neural_mem_chaining_chunks(
     gated_transition
 ):
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 16,
         gated_transition = gated_transition
     )

-    seq = torch.randn(2, 48,
+    seq = torch.randn(2, 48, 16)

     parallel_retrieved, state = mem(seq)

@@ -99,21 +102,21 @@ def test_neural_mem_chaining_chunks(

 def test_neural_mem_chaining_with_weight_residual():
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 64
     )

     mem2 = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 64,
         accept_weight_residual = True
     )

-    seq = torch.randn(2, 256,
+    seq = torch.randn(2, 256, 16)

     seq, state = mem(seq)

@@ -124,18 +127,18 @@ def test_neural_mem_chaining_with_weight_residual():
     first_retrieved, state1 = mem2(seq_first, prev_weights = state.updates)
     second_retrieved, state2 = mem2(seq_second, state = state1, prev_weights = state.updates)

-    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved), dim = 1), atol = 1e-
+    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved), dim = 1), atol = 1e-5)

 def test_neural_mem_chaining_with_batch_size():
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 16,
         batch_size = 64
     )

-    seq = torch.randn(2, 112,
+    seq = torch.randn(2, 112, 16)

     parallel_retrieved, state = mem(seq)

@@ -169,7 +172,7 @@ def test_mac(
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
-        dim =
+        dim = 16,
         depth = 2,
         num_persist_mem_tokens = num_persist_mem_tokens,
         num_longterm_mem_tokens = num_longterm_mem_tokens,
@@ -201,7 +204,7 @@ def test_mac_sampling(
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
-        dim =
+        dim = 16,
         depth = 4,
         segment_len = 32,
         num_persist_mem_tokens = 4,
@@ -235,12 +238,12 @@ def test_neural_mem_inference(
 ):

     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = mem_chunk_size,
         gated_transition = gated_transition
     )

-    seq = torch.randn(2, seq_len,
+    seq = torch.randn(2, seq_len, 16)
     parallel_retrieved, _ = mem(seq)

     assert seq.shape == parallel_retrieved.shape
@@ -282,7 +285,7 @@ def test_flex(
         pytest.skip()

     attn = SegmentedAttention(
-        dim =
+        dim = 16,
         segment_len = 32,
         num_persist_mem_tokens = 1,
         num_longterm_mem_tokens = 1,
@@ -290,7 +293,7 @@ def test_flex(
         sliding = sliding
     ).cuda()

-    seq = torch.randn(1, seq_len,
+    seq = torch.randn(1, seq_len, 16).cuda()

     out_flex, _ = attn(seq)
     out_non_flex, _ = attn(seq, disable_flex_attn = True)
@@ -307,8 +310,8 @@ def test_assoc_scan():
     seq_len = 128
     mid_point = seq_len // 2

-    gates = torch.randn(2, seq_len,
-    inputs = torch.randn(2, seq_len,
+    gates = torch.randn(2, seq_len, 16).sigmoid()
+    inputs = torch.randn(2, seq_len, 16)

     output = scan(gates, inputs)

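The test changes above exercise the new `per_head_learned_parameters` flag on `NeuralMemory`. Below is a minimal, hedged sketch of driving that flag directly, reusing only shapes and keyword names that appear in the diff; the surrounding setup is illustrative rather than taken from the package docs.

```python
import torch
from titans_pytorch import NeuralMemory

# per_head_learned_parameters = True keeps a learned copy of the memory-model
# weights per head; False shares one copy tiled across batch * heads
mem = NeuralMemory(
    dim = 16,
    dim_head = 16,
    heads = 2,
    chunk_size = 16,
    per_head_learned_parameters = True
)

seq = torch.randn(2, 48, 16)      # (batch, seq_len, dim), as in the chaining test
retrieved, state = mem(seq)       # retrieval preserves the input shape

assert retrieved.shape == seq.shape
```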
{titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/associative_scan.py

@@ -3,18 +3,39 @@ from typing import Callable

 import torch
 from torch import Tensor
+from torch.nn import Module
 import torch.nn.functional as F

+from einops import rearrange, repeat, reduce, pack, unpack
+
 # taken from S5-pytorch repository
 # https://github.com/i404788/s5-pytorch/blob/74e2fdae00b915a62c914bf3615c0b8a4279eb84/s5/jax_compat.py#L51-L134

 # helper functions

+def exists(v):
+    return v is not None
+
+def default(*args):
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
+
 def pad_at_dim(t, pad, dim = -1, value = 0.):
     dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)

+def pack_one_with_inverse(t, pattern):
+    packed, packed_shape = pack([t], pattern)
+
+    def inverse(out, inv_pattern = None):
+        inv_pattern = default(inv_pattern, pattern)
+        return unpack(out, packed_shape, inv_pattern)[0]
+
+    return packed, inverse
+
 # the operator that is needed

 @torch.jit.script
@@ -88,3 +109,69 @@ def _interleave(a, b):
     interleaved = torch.flatten(stacked, start_dim=1, end_dim=2)

     return interleaved[:, :output_axis_len]
+
+# associative scan wrapper around naive and accelerated version
+
+class AssocScan(Module):
+    def __init__(
+        self,
+        use_accelerated = False
+    ):
+        super().__init__()
+        self.use_accelerated = use_accelerated
+
+    def forward(
+        self,
+        gates,
+        inputs,
+        prev = None,
+        remove_prev = None
+    ):
+        remove_prev = default(remove_prev, exists(prev))
+
+        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
+        gates, _ = pack_one_with_inverse(gates, 'b n *')
+
+        if exists(prev):
+            prev, _ = pack_one_with_inverse(prev, 'b *')
+
+        if exists(prev):
+            inputs, _ = pack([prev, inputs], 'b * d')
+            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
+
+        if not self.use_accelerated:
+            _, out = associative_scan(binary_operator, (gates, inputs))
+
+            if remove_prev:
+                out = out[:, 1:]
+
+            return inverse_pack_weight_shape(out)
+
+        from accelerated_scan.triton import scan as triton_scan
+        from accelerated_scan.warp import scan as warp_scan
+
+        scan = triton_scan if gates.is_cuda else warp_scan
+
+        def accelerate_scan_fn(gates, inputs):
+            gates = gates.expand_as(inputs)
+            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
+
+            seq_len = gates.shape[-1]
+            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
+
+            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
+            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
+
+            outputs = scan(gates.contiguous(), inputs.contiguous())
+
+            outputs = outputs[..., :seq_len]
+            outputs = rearrange(outputs, 'b d n -> b n d')
+
+            return outputs
+
+        out = accelerate_scan_fn(gates, inputs)
+
+        if remove_prev:
+            out = out[:, 1:]
+
+        return inverse_pack_weight_shape(out)
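`AssocScan` now lives in `associative_scan.py` (it previously sat in `neural_memory.py`; see the removal further down). A short usage sketch of the wrapper under the default naive path, with shapes borrowed from `test_assoc_scan`; the `prev` carry behaviour is read off the new `forward` above and should be treated as illustrative.

```python
import torch
from titans_pytorch.associative_scan import AssocScan

scan = AssocScan(use_accelerated = False)    # True switches to the accelerated-scan kernels

gates  = torch.randn(2, 128, 16).sigmoid()   # decay gates in (0, 1), shape (batch, seq, dim)
inputs = torch.randn(2, 128, 16)

out = scan(gates, inputs)                    # scanned output, same shape as inputs
assert out.shape == inputs.shape

# an optional carry seeds the scan; by default the seed position is then
# dropped again, so the output length still matches the inputs
prev = torch.randn(2, 16)
out_with_prev = scan(gates, inputs, prev = prev)
assert out_with_prev.shape == inputs.shape
```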
{titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/titans_pytorch/neural_memory.py

@@ -8,16 +8,12 @@ from collections import namedtuple
 import torch
 from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
-from torch.nn import Linear, Module, Parameter, ParameterList
+from torch.nn import Linear, Module, Parameter, ParameterList, ParameterDict
 from torch.func import functional_call, vmap, grad

 from tensordict import TensorDict

-from titans_pytorch.associative_scan import (
-    associative_scan,
-    binary_operator,
-    pad_at_dim
-)
+from titans_pytorch.associative_scan import AssocScan

 from titans_pytorch.memory_models import(
     MemoryMLP
@@ -79,8 +75,8 @@ def safe_cat(inputs, dim = -2):
 def is_empty_tensor(t):
     return t.numel() == 0

-def
-    return
+def dict_get_value_shapes(td):
+    return [v.shape for k, v in td.items()]

 def rearrange_dict_values(td, pattern, **kwargs):
     return td.apply(lambda t: rearrange(t, pattern, **kwargs))
@@ -97,6 +93,11 @@ def round_down_multiple(seq, mult):
 def round_up_multiple(seq, mult):
     return math.ceil(seq / mult) * mult

+def pad_at_dim(t, pad, dim = -1, value = 0.):
+    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    zeros = ((0, 0) * dims_from_right)
+    return F.pad(t, (*zeros, *pad), value = value)
+
 def pack_one_with_inverse(t, pattern):
     packed, packed_shape = pack([t], pattern)

@@ -197,72 +198,6 @@ class AttentionPool(Module):

         return reduce(x * attn, 'b n c d -> b n d', 'sum')

-# associative scan wrapper
-
-class AssocScan(Module):
-    def __init__(
-        self,
-        use_accelerated = False
-    ):
-        super().__init__()
-        self.use_accelerated = use_accelerated
-
-    def forward(
-        self,
-        gates,
-        inputs,
-        prev = None,
-        remove_prev = None
-    ):
-        remove_prev = default(remove_prev, exists(prev))
-
-        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
-        gates, _ = pack_one_with_inverse(gates, 'b n *')
-
-        if exists(prev):
-            prev, _ = pack_one_with_inverse(prev, 'b *')
-
-        if exists(prev):
-            inputs, _ = pack([prev, inputs], 'b * d')
-            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
-
-        if not self.use_accelerated:
-            _, out = associative_scan(binary_operator, (gates, inputs))
-
-            if remove_prev:
-                out = out[:, 1:]
-
-            return inverse_pack_weight_shape(out)
-
-        from accelerated_scan.triton import scan as triton_scan
-        from accelerated_scan.warp import scan as warp_scan
-
-        scan = triton_scan if gates.is_cuda else warp_scan
-
-        def accelerate_scan_fn(gates, inputs):
-            gates = gates.expand_as(inputs)
-            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
-
-            seq_len = gates.shape[-1]
-            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
-
-            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
-            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
-
-            outputs = scan(gates.contiguous(), inputs.contiguous())
-
-            outputs = outputs[..., :seq_len]
-            outputs = rearrange(outputs, 'b d n -> b n d')
-
-            return outputs
-
-        out = accelerate_scan_fn(gates, inputs)
-
-        if remove_prev:
-            out = out[:, 1:]
-
-        return inverse_pack_weight_shape(out)
-
 # main neural memory

 def default_adaptive_step_transform(adaptive_step, max_lr = 1e-2):
@@ -285,6 +220,7 @@ class NeuralMemory(Module):
         default_step_transform_max_lr = 1.,
         per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
         max_mem_layer_modulation = 1., # max of 10.
+        per_head_learned_parameters = True,
         attn_pool_chunks = False,
         momentum = True,
         pre_rmsnorm = True,
@@ -370,9 +306,21 @@

         self.memory_model = model

-
+        mem_model_params = dict(model.named_parameters())
+
+        self.num_memory_parameter_tensors = len(mem_model_params)
+
+        self.memory_model_parameter_names = [*mem_model_params.keys()]
+
+        memory_model_parameters = [*mem_model_params.values()]
+
+        if per_head_learned_parameters:
+            memory_model_parameters = [repeat(p, '... -> h ...', h = heads) for p in memory_model_parameters]
+
+        self.init_weight_shape = [p.shape for p in memory_model_parameters]

-        self.
+        self.memory_model_parameters = ParameterList(memory_model_parameters)
+        self.per_head_learned_parameters = per_head_learned_parameters

         # the chunk size within the paper where adaptive step, momentum, weight decay are shared

@@ -488,21 +436,32 @@

         self.register_buffer('zero', torch.tensor(0.), persistent = False)

+    @property
+    def memory_model_parameter_dict(self):
+        return TensorDict(dict(zip(self.memory_model_parameter_names, self.memory_model_parameters)))
+
     def init_weights(
         self,
         batch,
     ):
-
-
+        if self.per_head_learned_parameters:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, 'h ... -> (b h) ...', b = batch)
+        else:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, '... -> bh ...', bh = batch * self.heads)
+
         return weights

     def init_momentum(
         self,
         batch,
     ):
-
-
-
+        zeros = self.memory_model_parameter_dict.clone().zero_()
+
+        if self.per_head_learned_parameters:
+            zeros = repeat_dict_values(zeros, 'h ... -> (b h) ...', b = batch)
+        else:
+            zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
+
         return zeros

     def store_memories(
@@ -690,16 +649,27 @@
     def retrieve_memories(
         self,
         seq,
-
-        chunk_size = None,
-        need_pad = True
+        weights: dict[str, Tensor],
     ):
-        chunk_size =
+        chunk_size = self.retrieve_chunk_size
+
+        weights_have_expanded_shape = dict_get_value_shapes(weights) != self.init_weight_shape
+
         batch, seq_len = seq.shape[:2]

-
+        # auto infer single token decoding, if there are only 1 set of weights and 1 token
+
+        is_one_token = seq_len == 1
+        is_one_weight = (not weights_have_expanded_shape) or next(iter(weights.values())).shape[1] == 1
+
+        is_single_token_decode = is_one_token and is_one_weight

-
+        if is_single_token_decode:
+            chunk_size = 1
+
+        # padding related, for chunked processing
+
+        need_pad = chunk_size > 1 or not is_one_weight

         if need_pad:
             seq = pad_at_dim(seq, (1, 0), dim = 1)
@@ -714,7 +684,11 @@
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper

-
+        weights = TensorDict(weights)
+
+        # pre norm
+
+        seq = self.retrieve_norm(seq)

         # sequence Float['b n d'] to queries

@@ -730,14 +704,14 @@

         # fetch values from memory model

-        if
-
+        if weights_have_expanded_shape:
+            weights = rearrange_dict_values(weights, 'b n ... -> (b n) ...')

         queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)

         # forward functional call

-        values = functional_call(self.memory_model, dict(
+        values = functional_call(self.memory_model, dict(weights), queries)

         # reconstitute batch dimension

@@ -885,22 +859,13 @@

         # retrieve

-        need_pad = True
-        retrieve_chunk_size = None
-
         if is_single_token:
-            retrieve_chunk_size = 1
-            need_pad = False
-
             last_update, _ = next_neural_mem_state.states
-
             updates = rearrange_dict_values(last_update, 'b ... -> b 1 ...')

         retrieved = self.retrieve_memories(
             seq,
-            updates
-            chunk_size = retrieve_chunk_size,
-            need_pad = need_pad,
+            updates
         )

         return retrieved, next_neural_mem_state
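The per-head bookkeeping added above boils down to two einops `repeat` patterns: a per-head copy stored at construction time, then a per-batch expansion in `init_weights` (or a plain `batch * heads` tile when the copies are shared). A standalone sketch of just those expansions, with arbitrary tensor sizes:

```python
import torch
from einops import repeat

heads, batch = 2, 3
weight = torch.randn(16, 16)    # stand-in for one memory-model parameter tensor

# per_head_learned_parameters = True: a learned copy per head ...
per_head = repeat(weight, '... -> h ...', h = heads)                  # (heads, 16, 16)
# ... expanded over the batch when the fast weights are initialized
init_per_head = repeat(per_head, 'h ... -> (b h) ...', b = batch)     # (batch * heads, 16, 16)

# per_head_learned_parameters = False: one shared copy tiled over batch * heads
init_shared = repeat(weight, '... -> bh ...', bh = batch * heads)     # (batch * heads, 16, 16)

assert init_per_head.shape == init_shared.shape == (batch * heads, 16, 16)
```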
{titans_pytorch-0.3.2 → titans_pytorch-0.3.4}/train_mac.py

@@ -10,7 +10,11 @@ from torch.utils.data import DataLoader, Dataset

 from adam_atan2_pytorch import AdoptAtan2

-from titans_pytorch import
+from titans_pytorch import (
+    MemoryAsContextTransformer,
+    MemoryMLP,
+    MemoryAttention
+)

 # constants

@@ -35,6 +39,7 @@ NEURAL_MEM_GATE_ATTN_OUTPUT = False
 NEURAL_MEM_MOMENTUM = True
 NEURAL_MEM_QK_NORM = True
 NEURAL_MEM_MAX_LR = 1e-1
+USE_MEM_ATTENTION_MODEL = False
 WINDOW_SIZE = 32
 NEURAL_MEM_SEGMENT_LEN = 4 # set smaller for more granularity for learning rate / momentum etc
 NEURAL_MEM_BATCH_SIZE = 128 # set smaller to update the neural memory weights more often as it traverses the sequence
@@ -75,6 +80,18 @@ def decode_token(token):
 def decode_tokens(tokens):
     return ''.join(list(map(decode_token, tokens)))

+# memory model
+
+if USE_MEM_ATTENTION_MODEL:
+    neural_memory_model = MemoryAttention(
+        dim = 64
+    )
+else:
+    neural_memory_model = MemoryMLP(
+        dim = 64,
+        depth = NEURAL_MEMORY_DEPTH
+    )
+
 # instantiate memory-as-context transformer

 model = MemoryAsContextTransformer(
@@ -91,10 +108,7 @@ model = MemoryAsContextTransformer(
     neural_mem_weight_residual = NEURAL_MEM_WEIGHT_RESIDUAL,
     use_flex_attn = USE_FLEX_ATTN,
     sliding_window_attn = SLIDING_WINDOWS,
-    neural_memory_model =
-        dim = 64,
-        depth = NEURAL_MEMORY_DEPTH
-    ),
+    neural_memory_model = neural_memory_model,
     neural_memory_kwargs = dict(
         dim_head = 64,
         heads = 4,
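`train_mac.py` now builds the inner memory model up front and hands it to the transformer via `neural_memory_model`. The same swap works when constructing a `NeuralMemory` directly through its `model` kwarg (as `test_titans_attn_memory` does); a hedged sketch with arbitrary dimensions, not taken from the package docs:

```python
import torch
from titans_pytorch import NeuralMemory, MemoryMLP, MemoryAttention

USE_MEM_ATTENTION_MODEL = False

# mirror the train_mac.py toggle between the two memory model types
if USE_MEM_ATTENTION_MODEL:
    memory_model = MemoryAttention(dim = 64)
else:
    memory_model = MemoryMLP(dim = 64, depth = 2)

mem = NeuralMemory(
    dim = 64,
    chunk_size = 64,
    model = memory_model    # inner model dim chosen to match, following test_titans_attn_memory
)

retrieved, _ = mem(torch.randn(2, 128, 64))
assert retrieved.shape == (2, 128, 64)
```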