PyPI - titans-pytorch - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

titans-pytorch 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

titans_pytorch/associative_scan.py CHANGED Viewed

@@ -3,18 +3,39 @@ from typing import Callable
 import torch
 from torch import Tensor
+from torch.nn import Module
 import torch.nn.functional as F
+from einops import rearrange, repeat, reduce, pack, unpack
 # taken from S5-pytorch repository
 # https://github.com/i404788/s5-pytorch/blob/74e2fdae00b915a62c914bf3615c0b8a4279eb84/s5/jax_compat.py#L51-L134
 # helper functions
+def exists(v):
+    return v is not None
+def default(*args):
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
 def pad_at_dim(t, pad, dim = -1, value = 0.):
     dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)
+def pack_one_with_inverse(t, pattern):
+    packed, packed_shape = pack([t], pattern)
+    def inverse(out, inv_pattern = None):
+        inv_pattern = default(inv_pattern, pattern)
+        return unpack(out, packed_shape, inv_pattern)[0]
+    return packed, inverse
 # the operator that is needed
 @torch.jit.script
@@ -88,3 +109,69 @@ def _interleave(a, b):
     interleaved = torch.flatten(stacked, start_dim=1, end_dim=2)
     return interleaved[:, :output_axis_len]
+# associative scan wrapper around naive and accelerated version
+class AssocScan(Module):
+    def __init__(
+        self,
+        use_accelerated = False
+    ):
+        super().__init__()
+        self.use_accelerated = use_accelerated
+    def forward(
+        self,
+        gates,
+        inputs,
+        prev = None,
+        remove_prev = None
+    ):
+        remove_prev = default(remove_prev, exists(prev))
+        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
+        gates, _ = pack_one_with_inverse(gates, 'b n *')
+        if exists(prev):
+            prev, _ = pack_one_with_inverse(prev, 'b *')
+        if exists(prev):
+            inputs, _ = pack([prev, inputs], 'b * d')
+            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
+        if not self.use_accelerated:
+            _, out = associative_scan(binary_operator, (gates, inputs))
+            if remove_prev:
+                out = out[:, 1:]
+            return inverse_pack_weight_shape(out)
+        from accelerated_scan.triton import scan as triton_scan
+        from accelerated_scan.warp import scan as warp_scan
+        scan = triton_scan if gates.is_cuda else warp_scan
+        def accelerate_scan_fn(gates, inputs):
+            gates = gates.expand_as(inputs)
+            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
+            seq_len = gates.shape[-1]
+            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
+            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
+            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
+            outputs = scan(gates.contiguous(), inputs.contiguous())
+            outputs = outputs[..., :seq_len]
+            outputs = rearrange(outputs, 'b d n -> b n d')
+            return outputs
+        out = accelerate_scan_fn(gates, inputs)
+        if remove_prev:
+            out = out[:, 1:]
+        return inverse_pack_weight_shape(out)

titans_pytorch/memory_models.py CHANGED Viewed

@@ -37,7 +37,7 @@ class MemoryMLP(Module):
         self,
         dim,
         depth,
-        expansion_factor = 4.
+        expansion_factor = 2.
     ):
         super().__init__()
         dim_hidden = int(dim * expansion_factor)

titans_pytorch/neural_memory.py CHANGED Viewed

@@ -8,16 +8,12 @@ from collections import namedtuple
 import torch
 from torch import nn, cat, tensor, Tensor
 import torch.nn.functional as F
-from torch.nn import Linear, Module, Parameter, ParameterList
+from torch.nn import Linear, Module, Parameter, ParameterList, ParameterDict
 from torch.func import functional_call, vmap, grad
 from tensordict import TensorDict
-from titans_pytorch.associative_scan import (
-    associative_scan,
-    binary_operator,
-    pad_at_dim
-)
+from titans_pytorch.associative_scan import AssocScan
 from titans_pytorch.memory_models import(
     MemoryMLP
@@ -79,8 +75,8 @@ def safe_cat(inputs, dim = -2):
 def is_empty_tensor(t):
     return t.numel() == 0
-def dict_get_shape(td):
-    return {k: v.shape for k, v in td.items()}
+def dict_get_value_shapes(td):
+    return [v.shape for k, v in td.items()]
 def rearrange_dict_values(td, pattern, **kwargs):
     return td.apply(lambda t: rearrange(t, pattern, **kwargs))
@@ -97,6 +93,11 @@ def round_down_multiple(seq, mult):
 def round_up_multiple(seq, mult):
     return math.ceil(seq / mult) * mult
+def pad_at_dim(t, pad, dim = -1, value = 0.):
+    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    zeros = ((0, 0) * dims_from_right)
+    return F.pad(t, (*zeros, *pad), value = value)
 def pack_one_with_inverse(t, pattern):
     packed, packed_shape = pack([t], pattern)
@@ -197,72 +198,6 @@ class AttentionPool(Module):
         return reduce(x * attn, 'b n c d -> b n d', 'sum')
-# associative scan wrapper
-class AssocScan(Module):
-    def __init__(
-        self,
-        use_accelerated = False
-    ):
-        super().__init__()
-        self.use_accelerated = use_accelerated
-    def forward(
-        self,
-        gates,
-        inputs,
-        prev = None,
-        remove_prev = None
-    ):
-        remove_prev = default(remove_prev, exists(prev))
-        inputs, inverse_pack_weight_shape = pack_one_with_inverse(inputs, 'b n *')
-        gates, _ = pack_one_with_inverse(gates, 'b n *')
-        if exists(prev):
-            prev, _ = pack_one_with_inverse(prev, 'b *')
-        if exists(prev):
-            inputs, _ = pack([prev, inputs], 'b * d')
-            gates = pad_at_dim(gates, (1, 0), value = 1., dim = -2)
-        if not self.use_accelerated:
-            _, out = associative_scan(binary_operator, (gates, inputs))
-            if remove_prev:
-                out = out[:, 1:]
-            return inverse_pack_weight_shape(out)
-        from accelerated_scan.triton import scan as triton_scan
-        from accelerated_scan.warp import scan as warp_scan
-        scan = triton_scan if gates.is_cuda else warp_scan
-        def accelerate_scan_fn(gates, inputs):
-            gates = gates.expand_as(inputs)
-            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
-            seq_len = gates.shape[-1]
-            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
-            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
-            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
-            outputs = scan(gates.contiguous(), inputs.contiguous())
-            outputs = outputs[..., :seq_len]
-            outputs = rearrange(outputs, 'b d n -> b n d')
-            return outputs
-        out = accelerate_scan_fn(gates, inputs)
-        if remove_prev:
-            out = out[:, 1:]
-        return inverse_pack_weight_shape(out)
 # main neural memory
 def default_adaptive_step_transform(adaptive_step, max_lr = 1e-2):
@@ -285,6 +220,7 @@ class NeuralMemory(Module):
         default_step_transform_max_lr = 1.,
         per_parameter_lr_modulation = False, # allow outer network to control learning rate per weight matrix of memory network
         max_mem_layer_modulation = 1., # max of 10.
+        per_head_learned_parameters = True,
         attn_pool_chunks = False,
         momentum = True,
         pre_rmsnorm = True,
@@ -370,9 +306,21 @@ class NeuralMemory(Module):
         self.memory_model = model
-        self.num_memory_parameter_tensors = len(set(model.parameters()))
+        mem_model_params = dict(model.named_parameters())
+        self.num_memory_parameter_tensors = len(mem_model_params)
+        self.memory_model_parameter_names = [*mem_model_params.keys()]
+        memory_model_parameters = [*mem_model_params.values()]
+        if per_head_learned_parameters:
+            memory_model_parameters = [repeat(p, '... -> h ...', h = heads) for p in memory_model_parameters]
+        self.init_weight_shape = [p.shape for p in memory_model_parameters]
-        self.init_weight_shape = dict_get_shape(dict(model.named_parameters()))
+        self.memory_model_parameters = ParameterList(memory_model_parameters)
+        self.per_head_learned_parameters = per_head_learned_parameters
         # the chunk size within the paper where adaptive step, momentum, weight decay are shared
@@ -488,21 +436,32 @@ class NeuralMemory(Module):
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
+    @property
+    def memory_model_parameter_dict(self):
+        return TensorDict(dict(zip(self.memory_model_parameter_names, self.memory_model_parameters)))
     def init_weights(
         self,
         batch,
     ):
-        weights = TensorDict(dict(self.memory_model.named_parameters()))
-        weights = repeat_dict_values(weights, '... -> bh ...', bh = batch * self.heads)
+        if self.per_head_learned_parameters:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, 'h ... -> (b h) ...', b = batch)
+        else:
+            weights = repeat_dict_values(self.memory_model_parameter_dict, '... -> bh ...', bh = batch * self.heads)
         return weights
     def init_momentum(
         self,
         batch,
     ):
-        weights = TensorDict(dict(self.memory_model.named_parameters()))
-        zeros = weights.clone().zero_()
-        zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
+        zeros = self.memory_model_parameter_dict.clone().zero_()
+        if self.per_head_learned_parameters:
+            zeros = repeat_dict_values(zeros, 'h ... -> (b h) ...', b = batch)
+        else:
+            zeros = repeat_dict_values(zeros, '... -> bh ...', bh = batch * self.heads)
         return zeros
     def store_memories(
@@ -690,16 +649,27 @@ class NeuralMemory(Module):
     def retrieve_memories(
         self,
         seq,
-        past_weights: dict[str, Tensor],
-        chunk_size = None,
-        need_pad = True
+        weights: dict[str, Tensor],
     ):
-        chunk_size = default(chunk_size, self.retrieve_chunk_size)
+        chunk_size = self.retrieve_chunk_size
+        weights_have_expanded_shape = dict_get_value_shapes(weights) != self.init_weight_shape
         batch, seq_len = seq.shape[:2]
-        seq = self.retrieve_norm(seq)
+        # auto infer single token decoding, if there are only 1 set of weights and 1 token
+        is_one_token = seq_len == 1
+        is_one_weight = (not weights_have_expanded_shape) or next(iter(weights.values())).shape[1] == 1
+        is_single_token_decode = is_one_token and is_one_weight
-        need_pad = need_pad or chunk_size > 1
+        if is_single_token_decode:
+            chunk_size = 1
+        # padding related, for chunked processing
+        need_pad = chunk_size > 1 or not is_one_weight
         if need_pad:
             seq = pad_at_dim(seq, (1, 0), dim = 1)
@@ -714,7 +684,11 @@ class NeuralMemory(Module):
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
-        curr_weights = TensorDict(past_weights)
+        weights = TensorDict(weights)
+        # pre norm
+        seq = self.retrieve_norm(seq)
         # sequence Float['b n d'] to queries
@@ -730,14 +704,14 @@ class NeuralMemory(Module):
         # fetch values from memory model
-        if dict_get_shape(curr_weights) != self.init_weight_shape:
-            curr_weights = rearrange_dict_values(curr_weights, 'b n ... -> (b n) ...')
+        if weights_have_expanded_shape:
+            weights = rearrange_dict_values(weights, 'b n ... -> (b n) ...')
         queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)
         # forward functional call
-        values = functional_call(self.memory_model, dict(curr_weights), queries)
+        values = functional_call(self.memory_model, dict(weights), queries)
         # reconstitute batch dimension
@@ -885,22 +859,13 @@ class NeuralMemory(Module):
         # retrieve
-        need_pad = True
-        retrieve_chunk_size = None
         if is_single_token:
-            retrieve_chunk_size = 1
-            need_pad = False
             last_update, _ = next_neural_mem_state.states
             updates = rearrange_dict_values(last_update, 'b ... -> b 1 ...')
         retrieved = self.retrieve_memories(
             seq,
-            updates,
-            chunk_size = retrieve_chunk_size,
-            need_pad = need_pad,
+            updates
         )
         return retrieved, next_neural_mem_state

{titans_pytorch-0.3.2.dist-info → titans_pytorch-0.3.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.3.2
+Version: 0.3.4
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch

titans_pytorch-0.3.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
+titans_pytorch/associative_scan.py,sha256=CEPXaZ2fEPWF8ZBe5wihCqPSGi8PNyL0uVSgvY7eV-s,5147
+titans_pytorch/mac_transformer.py,sha256=5rO4GQxSyFWWEc3pc3xNyG0sK5EXE7MmxKI-_kEMl2M,24941
+titans_pytorch/memory_models.py,sha256=0KLHZN-y_7lwrhWSnFRaYJ3GiUV3tzVjxS9CxIx_eI8,4843
+titans_pytorch/neural_memory.py,sha256=9eyeEvYsP5OFlwLDRyVut99uVYGvXAElFPabVoZnGJw,27063
+titans_pytorch-0.3.4.dist-info/METADATA,sha256=2ZD_DovSYkVejsTWHq7_IOTN-Je0of1f-HOiojaQBhQ,6815
+titans_pytorch-0.3.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.3.4.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.3.4.dist-info/RECORD,,

titans_pytorch-0.3.2.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-titans_pytorch/__init__.py,sha256=Y3m_ZlpEqYwp-Md1ARhNGJxq8bQp8ty1o039nZOOJo0,276
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=5rO4GQxSyFWWEc3pc3xNyG0sK5EXE7MmxKI-_kEMl2M,24941
-titans_pytorch/memory_models.py,sha256=TJl7b9Rd5BP8aQXK8itap5YN3DyomUVxCRJDgPuRGBk,4843
-titans_pytorch/neural_memory.py,sha256=QiEnHnZfQ8ptuXNVy4NZf9-XMbMOl2_1PT_YIG1GQBc,27739
-titans_pytorch-0.3.2.dist-info/METADATA,sha256=Ar1OdcY09w-q3RlVKlxcgrtcVzZE6cRKqnjwQ4F-9Z8,6815
-titans_pytorch-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.3.2.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.3.2.dist-info/RECORD,,

{titans_pytorch-0.3.2.dist-info → titans_pytorch-0.3.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{titans_pytorch-0.3.2.dist-info → titans_pytorch-0.3.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

titans-pytorch 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

titans-pytorch 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl