titans-pytorch 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/PKG-INFO +1 -1
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/pyproject.toml +1 -1
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/tests/test_titans.py +25 -25
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/titans_pytorch/memory_models.py +6 -2
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/titans_pytorch/neural_memory.py +28 -21
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/.gitignore +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/LICENSE +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/README.md +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/data/README.md +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/data/enwik8.gz +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/fig1.png +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/fig2.png +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/titans_pytorch/mac_transformer.py +0 -0
- {titans_pytorch-0.3.1 → titans_pytorch-0.3.3}/train_mac.py +0 -0
tests/test_titans.py

@@ -42,7 +42,7 @@ def test_titans(
     per_parameter_lr_modulation
 ):
     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = chunk_size,
         activation = nn.SiLU() if silu else None,
         attn_pool_chunks = attn_pool_chunks,
@@ -52,7 +52,7 @@ def test_titans(
         per_parameter_lr_modulation = per_parameter_lr_modulation,
     )

-    seq = torch.randn(2, seq_len,
+    seq = torch.randn(2, seq_len, 16)
     retrieved, _ = mem(seq)

     assert seq.shape == retrieved.shape
@@ -61,14 +61,14 @@ def test_titans_attn_memory():
     from titans_pytorch.memory_models import MemoryAttention

     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = 64,
         model = MemoryAttention(
-            dim =
+            dim = 16
         )
     )

-    seq = torch.randn(2, 1024,
+    seq = torch.randn(2, 1024, 16)
     retrieved, _ = mem(seq)

     assert seq.shape == retrieved.shape
@@ -78,14 +78,14 @@ def test_neural_mem_chaining_chunks(
     gated_transition
 ):
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 16,
         gated_transition = gated_transition
     )

-    seq = torch.randn(2, 48,
+    seq = torch.randn(2, 48, 16)

     parallel_retrieved, state = mem(seq)

@@ -99,21 +99,21 @@ def test_neural_mem_chaining_chunks(

 def test_neural_mem_chaining_with_weight_residual():
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 64
     )

     mem2 = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 64,
         accept_weight_residual = True
     )

-    seq = torch.randn(2, 256,
+    seq = torch.randn(2, 256, 16)

     seq, state = mem(seq)

@@ -124,18 +124,18 @@ def test_neural_mem_chaining_with_weight_residual():
     first_retrieved, state1 = mem2(seq_first, prev_weights = state.updates)
     second_retrieved, state2 = mem2(seq_second, state = state1, prev_weights = state.updates)

-    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved), dim = 1), atol = 1e-
+    assert torch.allclose(parallel_retrieved, torch.cat((first_retrieved, second_retrieved), dim = 1), atol = 1e-5)

 def test_neural_mem_chaining_with_batch_size():
     mem = NeuralMemory(
-        dim =
-        dim_head =
+        dim = 16,
+        dim_head = 16,
         heads = 2,
         chunk_size = 16,
         batch_size = 64
     )

-    seq = torch.randn(2, 112,
+    seq = torch.randn(2, 112, 16)

     parallel_retrieved, state = mem(seq)

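The chaining tests above all assert the same invariant: feeding a sequence in pieces, while threading the returned state through, must reproduce the parallel result. A minimal sketch of that pattern, using only constructor arguments and keyword names visible in the diff (the split point at a chunk boundary is my choice for illustration):

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 16, dim_head = 16, heads = 2, chunk_size = 16)

seq = torch.randn(2, 48, 16)

# whole sequence in one parallel call
parallel_retrieved, _ = mem(seq)

# same sequence in two pieces, threading the returned state through
first, second = seq[:, :32], seq[:, 32:]

first_retrieved, state1 = mem(first)
second_retrieved, state2 = mem(second, state = state1)

chained = torch.cat((first_retrieved, second_retrieved), dim = 1)
assert torch.allclose(parallel_retrieved, chained, atol = 1e-5)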
@@ -169,7 +169,7 @@ def test_mac(
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
-        dim =
+        dim = 16,
         depth = 2,
         num_persist_mem_tokens = num_persist_mem_tokens,
         num_longterm_mem_tokens = num_longterm_mem_tokens,
@@ -201,7 +201,7 @@ def test_mac_sampling(
 ):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
-        dim =
+        dim = 16,
         depth = 4,
         segment_len = 32,
         num_persist_mem_tokens = 4,
@@ -235,12 +235,12 @@ def test_neural_mem_inference(
 ):

     mem = NeuralMemory(
-        dim =
+        dim = 16,
         chunk_size = mem_chunk_size,
         gated_transition = gated_transition
     )

-    seq = torch.randn(2, seq_len,
+    seq = torch.randn(2, seq_len, 16)
     parallel_retrieved, _ = mem(seq)

     assert seq.shape == parallel_retrieved.shape
@@ -282,7 +282,7 @@ def test_flex(
         pytest.skip()

     attn = SegmentedAttention(
-        dim =
+        dim = 16,
         segment_len = 32,
         num_persist_mem_tokens = 1,
         num_longterm_mem_tokens = 1,
@@ -290,7 +290,7 @@ def test_flex(
         sliding = sliding
     ).cuda()

-    seq = torch.randn(1, seq_len,
+    seq = torch.randn(1, seq_len, 16).cuda()

     out_flex, _ = attn(seq)
     out_non_flex, _ = attn(seq, disable_flex_attn = True)
@@ -307,8 +307,8 @@ def test_assoc_scan():
     seq_len = 128
     mid_point = seq_len // 2

-    gates = torch.randn(2, seq_len,
-    inputs = torch.randn(2, seq_len,
+    gates = torch.randn(2, seq_len, 16).sigmoid()
+    inputs = torch.randn(2, seq_len, 16)

     output = scan(gates, inputs)

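For reference, `scan` here comes from titans_pytorch/associative_scan.py. The shapes in the updated test fit the standard first-order gated recurrence (an assumption on my part, stated in the comment below):

import torch

def reference_scan(gates, inputs):
    # naive O(n) loop for out[t] = gates[t] * out[t - 1] + inputs[t],
    # assumed to match what the parallel associative scan computes
    out, prev = [], torch.zeros_like(inputs[:, 0])
    for t in range(inputs.shape[1]):
        prev = gates[:, t] * prev + inputs[:, t]
        out.append(prev)
    return torch.stack(out, dim = 1)

gates  = torch.randn(2, 128, 16).sigmoid()  # gates in (0, 1), as in the test
inputs = torch.randn(2, 128, 16)

assert reference_scan(gates, inputs).shape == inputs.shape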
titans_pytorch/memory_models.py

@@ -36,10 +36,14 @@ class MemoryMLP(Module):
     def __init__(
         self,
         dim,
-        depth
+        depth,
+        expansion_factor = 2.
     ):
         super().__init__()
-
+        dim_hidden = int(dim * expansion_factor)
+        dims = (dim, *((dim_hidden,) * (depth - 1)), dim)
+
+        self.weights = ParameterList([Parameter(torch.randn(dim_in, dim_out)) for dim_in, dim_out in zip(dims[:-1], dims[1:])])

         self.ln = LayerNorm(dim)

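The new `dims` construction keeps the input and output widths at `dim` and expands only the hidden layers. Working through the expression for a small example:

dim, depth, expansion_factor = 16, 3, 2.

dim_hidden = int(dim * expansion_factor)           # 32
dims = (dim, *((dim_hidden,) * (depth - 1)), dim)  # (16, 32, 32, 16)

# zip(dims[:-1], dims[1:]) pairs consecutive widths, so the ParameterList
# holds `depth` matrices: (16, 32), (32, 32), (32, 16)
assert list(zip(dims[:-1], dims[1:])) == [(16, 32), (32, 32), (32, 16)]

# depth = 1 degenerates to a single (dim, dim) matrix - the fast-weight case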
titans_pytorch/neural_memory.py

@@ -299,7 +299,8 @@ class NeuralMemory(Module):
         accept_weight_residual = False,
         gated_transition = False,
         default_model_kwargs: dict = dict(
-            depth = 2
+            depth = 2,
+            expansion_factor = 4.
         )
     ):
         super().__init__()
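The matching default bump to `expansion_factor = 4.` means a NeuralMemory left on its default memory model gets the familiar 4x-expanded MLP. A quick check, assuming the default model is the MemoryMLP above (built here directly for illustration):

from titans_pytorch.memory_models import MemoryMLP

mlp = MemoryMLP(dim = 16, **dict(depth = 2, expansion_factor = 4.))

# dims = (16, 64, 16) -> weight shapes (16, 64) and (64, 16)
print([tuple(w.shape) for w in mlp.weights])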
@@ -689,16 +690,27 @@ class NeuralMemory(Module):
     def retrieve_memories(
         self,
         seq,
-
-        chunk_size = None,
-        need_pad = True
+        weights: dict[str, Tensor],
     ):
-        chunk_size =
+        chunk_size = self.retrieve_chunk_size
+
+        weights_have_expanded_shape = dict_get_shape(weights) != self.init_weight_shape
+
         batch, seq_len = seq.shape[:2]

-
+        # auto infer single token decoding, if there are only 1 set of weights and 1 token
+
+        is_one_token = seq_len == 1
+        is_one_weight = (not weights_have_expanded_shape) or next(iter(weights.values())).shape[1] == 1
+
+        is_single_token_decode = is_one_token and is_one_weight
+
+        if is_single_token_decode:
+            chunk_size = 1
+
+        # padding related, for chunked processing

-        need_pad =
+        need_pad = chunk_size > 1 or not is_one_weight

         if need_pad:
             seq = pad_at_dim(seq, (1, 0), dim = 1)
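The `is_single_token_decode` branch exists so that autoregressive inference can call retrieval one token at a time without chunk padding. A sketch of that calling pattern, mirroring `test_neural_mem_inference` (the state-threading keyword follows the tests above; the per-step token shape is my assumption):

import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 16, chunk_size = 16)

seq = torch.randn(2, 8, 16)

parallel_retrieved, _ = mem(seq)

state = None
outs = []

for token in seq.unbind(dim = 1):
    # one (batch, 1, dim) token per step triggers the chunk_size = 1 path
    out, state = mem(token[:, None, :], state = state)
    outs.append(out)

sequential_retrieved = torch.cat(outs, dim = 1)
# expected to match parallel_retrieved to within numerical tolerance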
@@ -713,7 +725,11 @@ class NeuralMemory(Module):
         # the parameters of the memory model stores the memories of the key / values
         # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper

-
+        weights = TensorDict(weights)
+
+        # pre norm
+
+        seq = self.retrieve_norm(seq)

         # sequence Float['b n d'] to queries

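The comment on the `kv` fast-weight equivalence is worth unpacking: with a single weight matrix, storing a key/value pair is an outer-product write, and retrieval is one matrix multiply, exactly as in linear attention:

import torch

d = 16
k, v, q = torch.randn(3, d)

kv = torch.outer(k, v)  # the "memory": a single d x d fast-weight matrix
fetched = q @ kv        # retrieval is q @ (kv); equals (q . k) * v
assert torch.allclose(fetched, (q @ k) * v)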
@@ -729,14 +745,14 @@ class NeuralMemory(Module):

         # fetch values from memory model

-        if
-
+        if weights_have_expanded_shape:
+            weights = rearrange_dict_values(weights, 'b n ... -> (b n) ...')

         queries = rearrange(queries, 'b h (n c) d -> (b h n) c d', c = chunk_size)

         # forward functional call

-        values = functional_call(self.memory_model, dict(
+        values = functional_call(self.memory_model, dict(weights), queries)

         # reconstitute batch dimension

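The `(b n)` fold works because `functional_call` substitutes parameter tensors without shape-checking, and the memory model's matmuls broadcast over the extra leading dimension. A minimal standalone illustration with a toy one-matrix memory (not the library's module):

import torch
from torch.func import functional_call
from torch.nn import Module, Parameter

class OneMatrixMemory(Module):
    def __init__(self, dim):
        super().__init__()
        self.weight = Parameter(torch.randn(dim, dim))

    def forward(self, x):
        return x @ self.weight  # broadcasts over leading batch dims of `weight`

mem_model = OneMatrixMemory(16)

bn, c = 6, 4                                   # (batch * num_chunks), chunk size
weights = {'weight': torch.randn(bn, 16, 16)}  # one weight matrix per chunk
queries = torch.randn(bn, c, 16)

values = functional_call(mem_model, weights, queries)  # shape (6, 4, 16)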
@@ -884,22 +900,13 @@ class NeuralMemory(Module):

         # retrieve

-        need_pad = True
-        retrieve_chunk_size = None
-
         if is_single_token:
-            retrieve_chunk_size = 1
-            need_pad = False
-
             last_update, _ = next_neural_mem_state.states
-
             updates = rearrange_dict_values(last_update, 'b ... -> b 1 ...')

         retrieved = self.retrieve_memories(
             seq,
-            updates
-            chunk_size = retrieve_chunk_size,
-            need_pad = need_pad,
+            updates
         )

         return retrieved, next_neural_mem_state