hippoformer 0.0.11__tar.gz → 0.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hippoformer-0.0.11 → hippoformer-0.0.14}/PKG-INFO +12 -1
- {hippoformer-0.0.11 → hippoformer-0.0.14}/README.md +11 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/hippoformer/hippoformer.py +118 -26
- {hippoformer-0.0.11 → hippoformer-0.0.14}/pyproject.toml +1 -1
- {hippoformer-0.0.11 → hippoformer-0.0.14}/tests/test_hippoformer.py +1 -1
- {hippoformer-0.0.11 → hippoformer-0.0.14}/.github/workflows/python-publish.yml +0 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/.github/workflows/test.yml +0 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/.gitignore +0 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/LICENSE +0 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/hippoformer/__init__.py +0 -0
- {hippoformer-0.0.11 → hippoformer-0.0.14}/hippoformer-fig6.png +0 -0
{hippoformer-0.0.11 → hippoformer-0.0.14}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hippoformer
-Version: 0.0.11
+Version: 0.0.14
 Summary: hippoformer
 Project-URL: Homepage, https://pypi.org/project/hippoformer/
 Project-URL: Repository, https://github.com/lucidrains/hippoformer
@@ -63,3 +63,14 @@ Implementation of [Hippoformer](https://openreview.net/forum?id=hxwV5EubAw), Int
     note = {under review}
 }
 ```
+
+```bibtex
+@article{Li2020GridCA,
+    title = {Grid Cells Are Ubiquitous in Neural Networks},
+    author = {Songlin Li and Yangdong Deng and Zhihua Wang},
+    journal = {ArXiv},
+    year = {2020},
+    volume = {abs/2003.03482},
+    url = {https://api.semanticscholar.org/CorpusID:212634300}
+}
+```
{hippoformer-0.0.11 → hippoformer-0.0.14}/README.md

@@ -16,3 +16,14 @@ Implementation of [Hippoformer](https://openreview.net/forum?id=hxwV5EubAw), Int
     note = {under review}
 }
 ```
+
+```bibtex
+@article{Li2020GridCA,
+    title = {Grid Cells Are Ubiquitous in Neural Networks},
+    author = {Songlin Li and Yangdong Deng and Zhihua Wang},
+    journal = {ArXiv},
+    year = {2020},
+    volume = {abs/2003.03482},
+    url = {https://api.semanticscholar.org/CorpusID:212634300}
+}
+```
{hippoformer-0.0.11 → hippoformer-0.0.14}/hippoformer/hippoformer.py

@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 import torch
-from torch import nn, Tensor, cat, stack, zeros_like, einsum, tensor
+from torch import nn, Tensor, cat, stack, arange, zeros_like, einsum, tensor
 import torch.nn.functional as F
-from torch.nn import Module
+from torch.nn import Module, ModuleList
 from torch.jit import ScriptModule, script_method
 from torch.func import vmap, grad, functional_call
 
@@ -221,8 +221,11 @@ class PathIntegration(Module):
 
         return self.rnn(transitions, prev_structural)
 
-# custom transformer
-
+# custom transformer proposed by James Whittington that bridges to hippocampal models with a few twists
+
+# the mmTEM can be seen as a linear attention / TTT variant of what he proposed
+# needed for the baseline as well as the parallel block to bolster local time prediction
+
 # https://arxiv.org/abs/2112.04035
 
 def FeedForward(dim, mult = 4.):
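The new comments frame mmTEM as a linear attention / TTT-style variant of Whittington's TEM-transformer. For reference only (this is not code from the package, and every name and dimension below is an illustrative assumption), the sketch contrasts a causal softmax-attention read with a linear-attention read that folds keys and values into a running fixed-size outer-product memory:

```python
# Illustrative only: softmax attention vs. a linear-attention (running
# outer-product memory) read over the same q, k, v. Not hippoformer code.
import torch
import torch.nn.functional as F
from torch import einsum

def softmax_attention_read(q, k, v):
    # q, k, v: (batch, seq, dim); causal read at every position
    scale = q.shape[-1] ** -0.5
    sim = einsum('b i d, b j d -> b i j', q * scale, k)
    i, j = sim.shape[-2:]
    causal = torch.ones(i, j).triu(1).bool()                 # mask out the future
    sim = sim.masked_fill(causal, -torch.finfo(sim.dtype).max)
    attn = sim.softmax(dim = -1)
    return einsum('b i j, b j d -> b i d', attn, v)

def linear_attention_read(q, k, v):
    # same interface, but keys/values are accumulated token by token into a
    # (dim x dim) memory, so the state never grows with sequence length
    phi = lambda t: F.elu(t) + 1                              # positive feature map
    q, k = phi(q), phi(k)
    memory = torch.zeros(q.shape[0], q.shape[-1], v.shape[-1])
    norm = torch.zeros(q.shape[0], q.shape[-1])
    outs = []
    for t in range(q.shape[1]):
        memory = memory + einsum('b d, b e -> b d e', k[:, t], v[:, t])
        norm = norm + k[:, t]
        num = einsum('b d, b d e -> b e', q[:, t], memory)
        den = einsum('b d, b d -> b', q[:, t], norm).clamp(min = 1e-6)
        outs.append(num / den[:, None])
    return torch.stack(outs, dim = 1)

q = k = v = torch.randn(1, 8, 16)
print(softmax_attention_read(q, k, v).shape, linear_attention_read(q, k, v).shape)
```

Both reads return the same shape; the linear variant is what makes a recurrent, fixed-size memory (and hence a TTT-style update rule) possible, which is the sense in which mmTEM parallels the transformer block below.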
@@ -238,19 +241,32 @@ class Attention(Module):
         self,
         dim_q,
         dim_kv,
+        window_size,
         dim_head = 64,
-        heads = 8
+        heads = 8,
+        implicit_mlp_expansion = 2 # for fair comparison, the attention should have an implicit mlp of 2 layers with a non-linearity, just like the meta-memory mlp in titans (linear attention)
     ):
         super().__init__()
         dim_inner = dim_head * heads
+        dim_mlp_inner = dim_head * heads * implicit_mlp_expansion
+
         self.scale = dim_head ** -0.5
 
         self.to_queries = nn.Linear(dim_q, dim_inner, bias = False)
-
+
+        self.to_w1_keys = nn.Linear(dim_kv, dim_inner, bias = False)
+        self.to_w1_values = nn.Linear(dim_kv, dim_mlp_inner, bias = False)
+
+        self.implicit_mlp_activation = nn.SiLU()
+
+        self.to_w2_keys = nn.Linear(dim_kv, dim_mlp_inner, bias = False)
+        self.to_w2_values = nn.Linear(dim_kv, dim_inner, bias = False)
 
         self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)
         self.merge_heads = Rearrange('b h n d -> b n (h d)')
 
+        self.window_size = window_size
+
         self.to_out = nn.Linear(dim_inner, dim_q, bias = False)
         self.attn_head_sink = nn.Parameter(torch.randn(heads) * 1e-2) # needed as the diagonal is masked out, and for attention sink
 
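To make the `implicit_mlp_expansion` comment concrete: registering two key/value projection pairs (w1 and w2) turns a single attention call into a data-dependent two-layer MLP applied to each query, with `nn.SiLU()` in between. A minimal sketch of that pattern, with toy dimensions, a single head, and no masking or caching (all assumptions for illustration, not the package's exact code):

```python
# Illustrative sketch of the "implicit two-layer MLP" attention pattern:
# attend over (k1, v1), apply a non-linearity, attend again over (k2, v2).
import torch
from torch import nn, einsum

dim_q, dim_kv, dim_inner, dim_mlp_inner = 32, 48, 64, 128    # toy sizes

to_queries   = nn.Linear(dim_q,  dim_inner,     bias = False)
to_w1_keys   = nn.Linear(dim_kv, dim_inner,     bias = False)
to_w1_values = nn.Linear(dim_kv, dim_mlp_inner, bias = False)
to_w2_keys   = nn.Linear(dim_kv, dim_mlp_inner, bias = False)
to_w2_values = nn.Linear(dim_kv, dim_inner,     bias = False)
activation   = nn.SiLU()

def attend(q, k, v):
    sim = einsum('b i d, b j d -> b i j', q * (q.shape[-1] ** -0.5), k)
    return einsum('b i j, b j d -> b i d', sim.softmax(dim = -1), v)

queries_input    = torch.randn(1, 7, dim_q)
key_values_input = torch.randn(1, 7, dim_kv)

q      = to_queries(queries_input)
k1, v1 = to_w1_keys(key_values_input), to_w1_values(key_values_input)
k2, v2 = to_w2_keys(key_values_input), to_w2_values(key_values_input)

hiddens = attend(q, k1, v1)          # first "layer": dim_inner -> dim_mlp_inner
hiddens = activation(hiddens)        # non-linearity, as in a two-layer MLP
out     = attend(hiddens, k2, v2)    # second "layer": dim_mlp_inner -> dim_inner
print(out.shape)                     # torch.Size([1, 7, 64])
```

The first pass widens the per-query representation to `dim_mlp_inner` and the second projects it back to `dim_inner`, mirroring the w1/w2 linear layers registered in the hunk above.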
@@ -264,43 +280,59 @@ class Attention(Module):
 
         q = self.to_queries(queries_input)
 
+        k1, v1, k2, v2 = [fn(key_values_input) for fn in (self.to_w1_keys, self.to_w1_values, self.to_w2_keys, self.to_w2_values)]
 
+        q, k1, v1, k2, v2 = tuple(self.split_heads(t) for t in (q, k1, v1, k2, v2))
 
         if exists(kv_cache):
+            ck1, cv1, ck2, cv2 = kv_cache
+            k1 = cat((ck1, k1), dim = -2)
+            v1 = cat((cv1, v1), dim = -2)
+            k2 = cat((ck2, k2), dim = -2)
+            v2 = cat((cv2, v2), dim = -2)
+
+        def attend(q, k, v):
+            q = q * self.scale
+
+            sim = einsum('b h i d, b h j d -> b h i j', q, k)
 
+            # the diagonal is masked out
 
+            i, j = sim.shape[-2:]
 
+            j_seq = arange(j, device = device)[:, None]
+            i_seq = arange(i, device = device)[None, :] + (j - i)
 
-        causal_mask_without_diagonal = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i)
+            windowed_causal_mask_without_diagonal = (i_seq > j_seq) & ((i_seq - j_seq) <= self.window_size)
 
+            sim = sim.masked_fill(windowed_causal_mask_without_diagonal, -torch.finfo(sim.dtype).max)
 
+            # attention sink, for token as well as for attention sinking - from gpt-oss
 
+            attn_sink = repeat(self.attn_head_sink, 'h -> b h i 1', b = batch, i = seq_len)
 
+            sim = cat((attn_sink, sim), dim = -1)
 
+            attn = sim.softmax(dim = -1)
 
+            attn = attn[..., 1:] # remove sink
 
+            # aggregate
 
+            out = einsum('b h i j, b h j d -> b h i d', attn, v)
+            return out
+
+        # implicit memory mlp w1
+
+        hiddens = attend(q, k1, v1)
+        hiddens = self.implicit_mlp_activation(hiddens)
+        out = attend(hiddens, k2, v2)
+
+        # merge heads
 
         out = self.merge_heads(out)
 
-        return self.to_out(out),
+        return self.to_out(out), (k1, v1, k2, v2)
 
 class TEMTransformerBlock(Module):
     def __init__(
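Two details of the rewritten forward pass are easy to misread: attention is restricted to keys that are strictly in the past and at most `window_size` steps back (the diagonal stays masked), and the learned per-head sink is prepended as an extra logit before the softmax so every query has somewhere to attend, then dropped before aggregating values. A self-contained sketch of that masking-plus-sink pattern, using toy sizes and an index convention chosen for clarity (assumptions, not the package's exact tensors):

```python
# Illustrative sketch: strictly-past windowed attention (diagonal masked out)
# with a learned per-head "attention sink" logit prepended before the softmax.
import torch
from torch import arange, cat, einsum

batch, heads, seq_len, dim_head, window_size = 1, 2, 6, 8, 3

q = torch.randn(batch, heads, seq_len, dim_head)
k = torch.randn(batch, heads, seq_len, dim_head)
v = torch.randn(batch, heads, seq_len, dim_head)
attn_head_sink = torch.randn(heads) * 1e-2

sim = einsum('b h i d, b h j d -> b h i j', q * dim_head ** -0.5, k)

# allowed[i, j] is True only when key j is strictly in the past of query i
# and at most window_size steps back; the diagonal (j == i) stays masked
i_seq = arange(seq_len)[:, None]      # query positions
j_seq = arange(seq_len)[None, :]      # key positions
allowed = (i_seq > j_seq) & ((i_seq - j_seq) <= window_size)

sim = sim.masked_fill(~allowed, -torch.finfo(sim.dtype).max)

# prepend the sink logit so every row has at least one unmasked entry,
# softmax, then drop the sink column before aggregating values
sink = attn_head_sink.view(1, heads, 1, 1).expand(batch, heads, seq_len, 1)
attn = cat((sink, sim), dim = -1).softmax(dim = -1)[..., 1:]

out = einsum('b h i j, b h j d -> b h i d', attn, v)
print(out.shape)   # torch.Size([1, 2, 6, 8])
```

With this construction the first query, which has no valid past key, puts essentially all of its probability mass on the sink and contributes a near-zero output instead of producing NaNs.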
@@ -314,7 +346,7 @@ class TEMTransformerBlock(Module):
     ):
         super().__init__()
 
-        self.attn = Attention(dim_structure, dim_structure + dim_encoded_sensory, dim_head = dim_head, heads = heads)
+        self.attn = Attention(dim_structure, dim_structure + dim_encoded_sensory, window_size, dim_head = dim_head, heads = heads)
         self.ff = FeedForward(dim_structure, ff_expansion_factor)
 
         self.window_size = window_size
@@ -337,6 +369,66 @@ class TEMTransformerBlock(Module):
 
         return x, next_kv_cache
 
+class TEMTransformer(Module):
+    def __init__(
+        self,
+        sensory_encoder_decoder: tuple[Module, Module],
+        dim_sensory,
+        dim_action,
+        dim_encoded_sensory,
+        dim_structure,
+        depth = 4,
+        transformer_kwargs: dict = dict(
+            dim_head = 64,
+            heads = 8,
+            ff_expansion_factor = 4,
+            window_size = 32
+        ),
+    ):
+        super().__init__()
+
+        self.sensory_encoder, self.sensory_decoder = sensory_encoder_decoder
+
+        self.path_integrator = nn.GRU(dim_action, dim_structure)
+
+        self.layers = ModuleList([])
+
+        for _ in range(depth):
+
+            block = TEMTransformerBlock(
+                dim_structure,
+                dim_encoded_sensory,
+                **transformer_kwargs
+            )
+
+            self.layers.append(block)
+
+    def forward(
+        self,
+        sensory,
+        actions,
+        prev_hiddens = None, # for the GRU based path integrator
+        prev_kv_cache = None # for the specialized transformer blocks for inducing the grid-cells
+    ):
+
+        structure, next_hiddens = self.path_integrator(actions, prev_hiddens)
+
+        encoded_sensory = self.sensory_encoder(sensory)
+
+        next_kv_cache = []
+
+        for layer in self.layers:
+            structure, layer_next_cache = layer(structure, encoded_sensory)
+            next_kv_cache.append(layer_next_cache)
+
+        decoded_sensory = self.sensory_decoder(structure)
+
+        next_memories = (next_hiddens, stack(next_kv_cache))
+
+        pred_loss = F.mse_loss(encoded_sensory, decoded_sensory)
+
+        return pred_loss
+
 # proposed mmTEM
 
 class mmTEM(Module):
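The added `TEMTransformer` wires together a GRU path integrator (actions → structural code), a stack of the windowed-attention blocks above (structural code attending over the encoded sensory input), and a sensory encoder/decoder pair trained with an MSE prediction loss. The standalone sketch below mirrors that forward-pass structure with plain torch modules; the toy encoder/decoder, dimensions, and the simplified cross-attention stand-in for `TEMTransformerBlock` are assumptions for illustration, not the package's API:

```python
# Standalone sketch of the TEMTransformer forward-pass structure:
# GRU path integration -> stacked cross-attention refinement -> decode -> MSE loss.
import torch
from torch import nn
import torch.nn.functional as F

dim_sensory, dim_action, dim_encoded_sensory, dim_structure, depth = 10, 4, 16, 32, 2

sensory_encoder = nn.Linear(dim_sensory, dim_encoded_sensory)
sensory_decoder = nn.Linear(dim_structure, dim_encoded_sensory)
path_integrator = nn.GRU(dim_action, dim_structure, batch_first = True)

# stand-in for TEMTransformerBlock: cross-attend the structural queries to
# the concatenation of structural + encoded sensory tokens, then feedforward
layers = nn.ModuleList([
    nn.ModuleDict(dict(
        attn = nn.MultiheadAttention(dim_structure, num_heads = 4,
                                     kdim = dim_structure + dim_encoded_sensory,
                                     vdim = dim_structure + dim_encoded_sensory,
                                     batch_first = True),
        ff   = nn.Sequential(nn.Linear(dim_structure, dim_structure * 4),
                             nn.GELU(),
                             nn.Linear(dim_structure * 4, dim_structure)),
    ))
    for _ in range(depth)
])

sensory = torch.randn(1, 7, dim_sensory)
actions = torch.randn(1, 7, dim_action)

structure, next_hiddens = path_integrator(actions)           # path integration
encoded_sensory = sensory_encoder(sensory)                    # encode observations

for layer in layers:
    kv = torch.cat((structure, encoded_sensory), dim = -1)
    attn_out, _ = layer['attn'](structure, kv, kv)
    structure = structure + attn_out
    structure = structure + layer['ff'](structure)

decoded_sensory = sensory_decoder(structure)                  # predict the sensory code
pred_loss = F.mse_loss(encoded_sensory, decoded_sensory)
print(pred_loss)
```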
{hippoformer-0.0.11 → hippoformer-0.0.14}/tests/test_hippoformer.py

@@ -69,7 +69,7 @@ def test_mm_tem(
 def test_tem_t():
     from hippoformer.hippoformer import TEMTransformerBlock
 
-    block = TEMTransformerBlock(32, 16)
+    block = TEMTransformerBlock(32, 16, window_size = 3)
 
     structural_codes = torch.randn(1, 7, 32)
     encoded_sensory = torch.randn(1, 7, 16)