titans-pytorch 0.3.3__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/PKG-INFO +1 -1
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/pyproject.toml +1 -1
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/tests/test_titans.py +4 -1
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/associative_scan.py +87 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/memory_models.py +23 -13
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/neural_memory.py +47 -83
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/train_mac.py +19 -5
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/.gitignore +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/LICENSE +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/README.md +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/data/README.md +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/data/enwik8.gz +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/fig1.png +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/fig2.png +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/mac_transformer.py +0 -0
{titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/tests/test_titans.py
@@ -31,6 +31,7 @@ def torch_default_dtype(dtype):
 @pytest.mark.parametrize('qk_rmsnorm', (False, True))
 @pytest.mark.parametrize('max_grad_norm', (None, 2.))
 @pytest.mark.parametrize('per_parameter_lr_modulation', (False, True))
+@pytest.mark.parametrize('per_head_learned_parameters', (False, True))
 def test_titans(
     seq_len,
     silu,
@@ -39,7 +40,8 @@ def test_titans(
     momentum,
     qk_rmsnorm,
     max_grad_norm,
-    per_parameter_lr_modulation
+    per_parameter_lr_modulation,
+    per_head_learned_parameters
 ):
     mem = NeuralMemory(
         dim = 16,
@@ -50,6 +52,7 @@ def test_titans(
         momentum = momentum,
         qk_rmsnorm = qk_rmsnorm,
         per_parameter_lr_modulation = per_parameter_lr_modulation,
+        per_head_learned_parameters = per_head_learned_parameters
     )

     seq = torch.randn(2, seq_len, 16)
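
Note: the new per_head_learned_parameters flag can also be exercised outside the test suite. Below is a minimal sketch of that usage, assuming titans-pytorch 0.3.5; the chunk size and sequence length are arbitrary choices, and the call assumes the library's usual two-value return of retrieved output plus memory state.

    # minimal sketch; dims and chunk size are arbitrary
    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(
        dim = 16,
        chunk_size = 2,
        per_head_learned_parameters = True   # new flag introduced in this release
    )

    seq = torch.randn(2, 64, 16)

    # retrieval output matches the input shape; the second value carries the updated memory state
    retrieved, state = mem(seq)
    assert retrieved.shape == seq.shape
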
{titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/associative_scan.py
@@ -3,18 +3,39 @@ from typing import Callable

 import torch
 from torch import Tensor
+from torch.nn import Module
 import torch.nn.functional as F

+from einops import rearrange, repeat, reduce, pack, unpack
+
 # taken from S5-pytorch repository
 # https://github.com/i404788/s5-pytorch/blob/74e2fdae00b915a62c914bf3615c0b8a4279eb84/s5/jax_compat.py#L51-L134

 # helper functions

+def exists(v):
+    return v is not None
+
+def default(*args):
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
+
 def pad_at_dim(t, pad, dim = -1, value = 0.):
     dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)

+def pack_one_with_inverse(t, pattern):
+    packed, packed_shape = pack([t], pattern)
+
+    def inverse(out, inv_pattern = None):
+        inv_pattern = default(inv_pattern, pattern)
+        return unpack(out, packed_shape, inv_pattern)[0]
+
+    return packed, inverse
+
 # the operator that is needed

 @torch.jit.script
@@ -88,3 +109,69 @@ def _interleave(a, b):
     interleaved = torch.flatten(stacked, start_dim=1, end_dim=2)

     return interleaved[:, :output_axis_len]
+
+# associative scan wrapper around naive and accelerated version
+
+class AssocScan(Module):
+    def __init__(
+        self,
+        use_accelerated = False
+    ):
+        super().__init__()
+        self.use_accelerated = use_accelerated
+
+    def forward(
+        self,
+        gates,
+        inputs,
+        prev = None,
+        remove_prev = None
+    ):
+        remove_prev = default(remove_prev, exists(prev))
+
+        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
+        gates, _ = pack_one_with_inverse(gates, 'b n *')
+
+        if exists(prev):
+            prev, _ = pack_one_with_inverse(prev, 'b *')
+
+        if exists(prev):
+            inputs, _ = pack([prev, inputs], 'b * d')
+            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
+
+        if not self.use_accelerated:
+            _, out = associative_scan(binary_operator, (gates, inputs))
+
+            if remove_prev:
+                out = out[:, 1:]
+
+            return inverse_pack_weight_shape(out)
+
+        from accelerated_scan.triton import scan as triton_scan
+        from accelerated_scan.warp import scan as warp_scan
+
+        scan = triton_scan if gates.is_cuda else warp_scan
+
+        def accelerate_scan_fn(gates, inputs):
+            gates = gates.expand_as(inputs)
+            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
+
+            seq_len = gates.shape[-1]
+            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
+
+            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
+            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
+
+            outputs = scan(gates.contiguous(), inputs.contiguous())
+
+            outputs = outputs[..., :seq_len]
+            outputs = rearrange(outputs, 'b d n -> b n d')
+
+            return outputs
+
+        out = accelerate_scan_fn(gates, inputs)
+
+        if remove_prev:
+            out = out[:, 1:]
+
+        return inverse_pack_weight_shape(out)
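
Note: the AssocScan wrapper relocated here from neural_memory.py can be driven on its own. The sketch below assumes the naive (non-accelerated) path and illustrates the first-order recurrence out[t] = gates[t] * out[t - 1] + inputs[t] that both code paths compute; the tensor sizes are arbitrary.

    # minimal sketch of the relocated AssocScan on the naive path
    import torch
    from titans_pytorch.associative_scan import AssocScan

    scan = AssocScan(use_accelerated = False)

    gates  = torch.rand(1, 8, 4)     # per-step decay factors in [0, 1]
    inputs = torch.randn(1, 8, 4)    # per-step values to accumulate

    out = scan(gates, inputs)        # parallel prefix scan over the sequence dimension
    assert out.shape == inputs.shape

    # an optional previous state is prepended as an extra step and stripped from the output by default
    prev = torch.randn(1, 4)
    out_from_prev = scan(gates, inputs, prev = prev)
    assert out_from_prev.shape == inputs.shape
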
{titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/memory_models.py
@@ -30,6 +30,25 @@ class LayerNorm(Module):

         return self.ln(x) * (gamma + 1.)

+# norm + residual wrapper, as used in original TTT paper
+# but could be removed
+
+class ResidualNorm(Module):
+    def __init__(
+        self,
+        dim,
+        model: Module
+    ):
+        super().__init__()
+        self.norm = LayerNorm(dim)
+        self.model = model
+
+    def forward(self, x):
+
+        out = self.model(x)
+
+        return self.norm(out) + x
+
 # memory mlp proposed in TTT

 class MemoryMLP(Module):
@@ -45,8 +64,6 @@ class MemoryMLP(Module):

         self.weights = ParameterList([Parameter(torch.randn(dim_in, dim_out)) for dim_in, dim_out in zip(dims[:-1], dims[1:])])

-        self.ln = LayerNorm(dim)
-
         for weight in self.weights:
             nn.init.xavier_uniform_(weight)

@@ -54,8 +71,6 @@ class MemoryMLP(Module):
         self,
         x
     ):
-        residual = x
-
         for ind, weight in enumerate(self.weights):
             is_first = ind == 0

@@ -64,7 +79,7 @@ class MemoryMLP(Module):

             x = x @ weight

-        return
+        return x

 # memory mlp, but with gated residual + final projection

@@ -97,7 +112,6 @@ class GatedResidualMemoryMLP(Module):
         self,
         x
     ):
-        residual = x

         for weight1, weight2, to_gates in self.weights:
             res = x
@@ -111,9 +125,7 @@ class GatedResidualMemoryMLP(Module):
             gates = cat((branch_out, res), dim = -1) @ to_gates
             x = res.lerp(branch_out, gates.sigmoid())

-
-
-        return self.ln(out) + residual
+        return x @ self.final_proj

 # memory mlp with factorized weights
 # so can tradeoff capacity for smaller chunk sizes
@@ -143,7 +155,6 @@ class FactorizedMemoryMLP(Module):
         self,
         x
     ):
-        residual = x

         for ind, (weight1, weight2) in enumerate(self.weights):
             is_first = ind == 0
@@ -153,7 +164,7 @@ class FactorizedMemoryMLP(Module):

             x = x @ weight1 @ weight2

-        return
+        return x

 # improvised attention as memory module

@@ -182,7 +193,6 @@ class MemoryAttention(Module):
             nn.init.xavier_uniform_(weight)

     def forward(self, x):
-        residual = x

         wq, wk, wv, ffw1, ffw2 = self.weights

@@ -202,4 +212,4 @@ class MemoryAttention(Module):
         h = F.gelu(x @ ffw1)
         ff_out = h @ ffw2

-        return
+        return attn_out + ff_out
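
Note: with the per-model layernorm and residual factored out of the individual memory modules, the new ResidualNorm wrapper restores that behavior externally. A minimal sketch of wrapping a MemoryMLP follows; the dimensions are arbitrary.

    # ResidualNorm applies the wrapped model, layernorms its output, and adds the input back
    import torch
    from titans_pytorch.memory_models import MemoryMLP, ResidualNorm

    mlp = MemoryMLP(dim = 64, depth = 2)
    wrapped = ResidualNorm(dim = 64, model = mlp)

    x = torch.randn(2, 32, 64)
    out = wrapped(x)            # norm(mlp(x)) + x
    assert out.shape == x.shape
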
{titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/titans_pytorch/neural_memory.py
@@ -8,19 +8,16 @@ from collections import namedtuple
 import torch
 from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
-from torch.nn import Linear, Module, Parameter, ParameterList
+from torch.nn import Linear, Module, Parameter, ParameterList, ParameterDict
 from torch.func import functional_call, vmap, grad

 from tensordict import TensorDict

-from titans_pytorch.associative_scan import (
-    associative_scan,
-    binary_operator,
-    pad_at_dim
-)
+from titans_pytorch.associative_scan import AssocScan

 from titans_pytorch.memory_models import(
-    MemoryMLP
+    MemoryMLP,
+    ResidualNorm
 )

 import einx
@@ -79,8 +76,8 @@ def safe_cat(inputs, dim = -2):
 def is_empty_tensor(t):
     return t.numel() == 0

-def
-    return
+def dict_get_value_shapes(td):
+    return [v.shape for k, v in td.items()]

 def rearrange_dict_values(td, pattern, **kwargs):
     return td.apply(lambda t: rearrange(t, pattern, **kwargs))
@@ -97,6 +94,11 @@ def round_down_multiple(seq, mult):
 def round_up_multiple(seq, mult):
     return math.ceil(seq / mult) * mult

+def pad_at_dim(t, pad, dim = -1, value = 0.):
+    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    zeros = ((0, 0) * dims_from_right)
+    return F.pad(t, (*zeros, *pad), value = value)
+
 def pack_one_with_inverse(t, pattern):
     packed, packed_shape = pack([t], pattern)

@@ -197,72 +199,6 @@ class AttentionPool(Module):

         return reduce(x * attn, 'b n c d -> b n d', 'sum')

-# associative scan wrapper
-
-class AssocScan(Module):
-    def __init__(
-        self,
-        use_accelerated = False
-    ):
-        super().__init__()
-        self.use_accelerated = use_accelerated
-
-    def forward(
-        self,
-        gates,
-        inputs,
-        prev = None,
-        remove_prev = None
-    ):
-        remove_prev = default(remove_prev, exists(prev))
-
-        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
-        gates, _ = pack_one_with_inverse(gates, 'b n *')
-
-        if exists(prev):
-            prev, _ = pack_one_with_inverse(prev, 'b *')
-
-        if exists(prev):
-            inputs, _ = pack([prev, inputs], 'b * d')
-            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
-
-        if not self.use_accelerated:
-            _, out = associative_scan(binary_operator, (gates, inputs))
-
-            if remove_prev:
-                out = out[:, 1:]
-
-            return inverse_pack_weight_shape(out)
-
-        from accelerated_scan.triton import scan as triton_scan
-        from accelerated_scan.warp import scan as warp_scan
-
-        scan = triton_scan if gates.is_cuda else warp_scan
-
-        def accelerate_scan_fn(gates, inputs):
-            gates = gates.expand_as(inputs)
-            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
-
-            seq_len = gates.shape[-1]
-            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
-
-            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
-            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
-
-            outputs = scan(gates.contiguous(), inputs.contiguous())
-
-            outputs = outputs[..., :seq_len]
-            outputs = rearrange(outputs, 'b d n -> b n d')
-
-            return outputs
-
-        out = accelerate_scan_fn(gates, inputs)
-
-        if remove_prev:
-            out = out[:, 1:]
-
-        return inverse_pack_weight_shape(out)
-
 # main neural memory

 def default_adaptive_step_transform(adaptive_step, max_lr = 1e-2):
@@ -285,6 +221,7 @@ class NeuralMemory(Module):
         default_step_transform_max_lr = 1.,
         per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
         max_mem_layer_modulation = 1., # max of 10.
+        per_head_learned_parameters = True,
         attn_pool_chunks = False,
         momentum = True,
         pre_rmsnorm = True,
@@ -298,6 +235,7 @@ class NeuralMemory(Module):
         init_decay_bias = None,
         accept_weight_residual = False,
         gated_transition = False,
+        mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
         default_model_kwargs: dict = dict(
             depth = 2,
             expansion_factor = 4.
@@ -368,11 +306,26 @@ class NeuralMemory(Module):

         # the memory is the weights of the model

+        if mem_model_norm_add_residual:
+            model = ResidualNorm(dim = dim_head, model = model)
+
         self.memory_model = model

-
+        mem_model_params = dict(model.named_parameters())
+
+        self.num_memory_parameter_tensors = len(mem_model_params)

-        self.
+        self.memory_model_parameter_names = [*mem_model_params.keys()]
+
+        memory_model_parameters = [*mem_model_params.values()]
+
+        if per_head_learned_parameters:
+            memory_model_parameters = [repeat(p, '... -> h ...', h = heads) for p in memory_model_parameters]
+
+        self.init_weight_shape = [p.shape for p in memory_model_parameters]
+
+        self.memory_model_parameters = ParameterList(memory_model_parameters)
+        self.per_head_learned_parameters = per_head_learned_parameters

         # the chunk size within the paper where adaptive step, momentum, weight decay are shared

@@ -488,21 +441,32 @@ class NeuralMemory(Module):

         self.register_buffer('zero', torch.tensor(0.), persistent = False)

+    @property
+    def memory_model_parameter_dict(self):
+        return TensorDict(dict(zip(self.memory_model_parameter_names, self.memory_model_parameters)))
+
     def init_weights(
         self,
         batch,
     ):
-
-
+        if self.per_head_learned_parameters:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, 'h ... -> (b h) ...', b = batch)
+        else:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, '... -> bh ...', bh = batch * self.heads)
+
         return weights

     def init_momentum(
         self,
         batch,
     ):
-
-
-
+        zeros = self.memory_model_parameter_dict.clone().zero_()
+
+        if self.per_head_learned_parameters:
+            zeros = repeat_dict_values(zeros, 'h ... -> (b h) ...', b = batch)
+        else:
+            zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
+
         return zeros

     def store_memories(
@@ -694,7 +658,7 @@ class NeuralMemory(Module):
     ):
         chunk_size = self.retrieve_chunk_size

-        weights_have_expanded_shape =
+        weights_have_expanded_shape = dict_get_value_shapes(weights) != self.init_weight_shape

         batch, seq_len = seq.shape[:2]

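
Note: with per_head_learned_parameters enabled (the default), each memory-model parameter tensor is registered once per head, which is visible directly on the module. A small sketch follows; the dim, head, and chunk-size values are illustrative.

    # sketch of the per-head parameter layout
    import torch
    from titans_pytorch import NeuralMemory

    mem = NeuralMemory(
        dim = 64,
        dim_head = 64,
        heads = 4,
        chunk_size = 16,
        per_head_learned_parameters = True
    )

    # every learned memory parameter now carries a leading head dimension
    for name, param in zip(mem.memory_model_parameter_names, mem.memory_model_parameters):
        assert param.shape[0] == 4
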
{titans_pytorch-0.3.3 → titans_pytorch-0.3.5}/train_mac.py
@@ -10,7 +10,11 @@ from torch.utils.data import DataLoader, Dataset

 from adam_atan2_pytorch import AdoptAtan2

-from titans_pytorch import
+from titans_pytorch import (
+    MemoryAsContextTransformer,
+    MemoryMLP,
+    MemoryAttention
+)

 # constants

@@ -35,6 +39,7 @@ NEURAL_MEM_GATE_ATTN_OUTPUT = False
 NEURAL_MEM_MOMENTUM = True
 NEURAL_MEM_QK_NORM = True
 NEURAL_MEM_MAX_LR = 1e-1
+USE_MEM_ATTENTION_MODEL = False
 WINDOW_SIZE = 32
 NEURAL_MEM_SEGMENT_LEN = 4 # set smaller for more granularity for learning rate / momentum etc
 NEURAL_MEM_BATCH_SIZE = 128 # set smaller to update the neural memory weights more often as it traverses the sequence
@@ -75,6 +80,18 @@ def decode_token(token):
 def decode_tokens(tokens):
     return ''.join(list(map(decode_token, tokens)))

+# memory model
+
+if USE_MEM_ATTENTION_MODEL:
+    neural_memory_model = MemoryAttention(
+        dim = 64
+    )
+else:
+    neural_memory_model = MemoryMLP(
+        dim = 64,
+        depth = NEURAL_MEMORY_DEPTH
+    )
+
 # instantiate memory-as-context transformer

 model = MemoryAsContextTransformer(
@@ -91,10 +108,7 @@ model = MemoryAsContextTransformer(
     neural_mem_weight_residual = NEURAL_MEM_WEIGHT_RESIDUAL,
     use_flex_attn = USE_FLEX_ATTN,
     sliding_window_attn = SLIDING_WINDOWS,
-    neural_memory_model =
-        dim = 64,
-        depth = NEURAL_MEMORY_DEPTH
-    ),
+    neural_memory_model = neural_memory_model,
     neural_memory_kwargs = dict(
         dim_head = 64,
         heads = 4,
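
Note: the training script now builds the memory model up front and passes it in, so switching between the MLP and attention memories is a one-flag change. A condensed sketch of that wiring follows; the transformer hyperparameters are illustrative and most of the script's settings (persistent memory tokens, neural memory layers, sliding windows, and so on) are left at their defaults.

    # condensed sketch of the new memory-model toggle
    import torch
    from titans_pytorch import MemoryAsContextTransformer, MemoryMLP, MemoryAttention

    USE_MEM_ATTENTION_MODEL = False

    neural_memory_model = (
        MemoryAttention(dim = 64)
        if USE_MEM_ATTENTION_MODEL else
        MemoryMLP(dim = 64, depth = 2)
    )

    transformer = MemoryAsContextTransformer(
        num_tokens = 256,
        dim = 384,
        depth = 8,
        segment_len = 32,                          # WINDOW_SIZE in the script
        neural_memory_model = neural_memory_model,
        neural_memory_kwargs = dict(dim_head = 64, heads = 4)
    )

    ids = torch.randint(0, 256, (1, 1024))
    loss = transformer(ids, return_loss = True)    # next-token cross entropy, as in the script
    loss.backward()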