titans-pytorch 0.3.4__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.3.4
+Version: 0.3.6
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.3.4"
+version = "0.3.6"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -30,6 +30,25 @@ class LayerNorm(Module):
 
         return self.ln(x) * (gamma + 1.)
 
+# norm + residual wrapper, as used in original TTT paper
+# but could be removed
+
+class ResidualNorm(Module):
+    def __init__(
+        self,
+        dim,
+        model: Module
+    ):
+        super().__init__()
+        self.norm = LayerNorm(dim)
+        self.model = model
+
+    def forward(self, x):
+
+        out = self.model(x)
+
+        return self.norm(out) + x
+
 # memory mlp proposed in TTT
 
 class MemoryMLP(Module):
@@ -45,8 +64,6 @@ class MemoryMLP(Module):
 
         self.weights = ParameterList([Parameter(torch.randn(dim_in, dim_out)) for dim_in, dim_out in zip(dims[:-1], dims[1:])])
 
-        self.ln = LayerNorm(dim)
-
         for weight in self.weights:
             nn.init.xavier_uniform_(weight)
 
@@ -54,8 +71,6 @@ class MemoryMLP(Module):
         self,
         x
     ):
-        residual = x
-
         for ind, weight in enumerate(self.weights):
             is_first = ind == 0
 
@@ -64,7 +79,7 @@ class MemoryMLP(Module):
 
             x = x @ weight
 
-        return self.ln(x) + residual
+        return x
 
 # memory mlp, but with gated residual + final projection
 
@@ -88,8 +103,6 @@ class GatedResidualMemoryMLP(Module):
 
         self.final_proj = Parameter(torch.randn(dim, dim))
 
-        self.ln = LayerNorm(dim)
-
         for param in self.parameters():
             nn.init.xavier_uniform_(param)
 
@@ -97,7 +110,6 @@ class GatedResidualMemoryMLP(Module):
         self,
         x
     ):
-        residual = x
 
         for weight1, weight2, to_gates in self.weights:
             res = x
@@ -111,9 +123,7 @@ class GatedResidualMemoryMLP(Module):
             gates = cat((branch_out, res), dim = -1) @ to_gates
             x = res.lerp(branch_out, gates.sigmoid())
 
-        out = x @ self.final_proj
-
-        return self.ln(out) + residual
+        return x @ self.final_proj
 
 # memory mlp with factorized weights
 # so can tradeoff capacity for smaller chunk sizes
@@ -133,8 +143,6 @@ class FactorizedMemoryMLP(Module):
             ]) for _ in range(depth)
         ])
 
-        self.ln = LayerNorm(dim)
-
         for weight1, weight2 in self.weights:
             nn.init.xavier_uniform_(weight1)
             nn.init.xavier_uniform_(weight2)
@@ -143,7 +151,6 @@ class FactorizedMemoryMLP(Module):
         self,
         x
     ):
-        residual = x
 
        for ind, (weight1, weight2) in enumerate(self.weights):
            is_first = ind == 0
@@ -153,7 +160,7 @@ class FactorizedMemoryMLP(Module):
 
             x = x @ weight1 @ weight2
 
-        return self.ln(x) + residual
+        return x
 
 # improvised attention as memory module
 
@@ -176,13 +183,10 @@ class MemoryAttention(Module):
             nn.Parameter(torch.randn(dim_ff_hidden, dim)), # ff w2
         ])
 
-        self.ln = LayerNorm(dim)
-
         for weight in self.weights:
             nn.init.xavier_uniform_(weight)
 
     def forward(self, x):
-        residual = x
 
         wq, wk, wv, ffw1, ffw2 = self.weights
 
@@ -202,4 +206,4 @@ class MemoryAttention(Module):
         h = F.gelu(x @ ffw1)
         ff_out = h @ ffw2
 
-        return self.ln(attn_out + ff_out) + residual
+        return attn_out + ff_out
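
The net effect in the memory models above is that the per-model LayerNorm + residual is factored out into the new ResidualNorm wrapper. A minimal sketch of how the wrapper composes with one of the memory models (class names and the dim/depth keywords mirror this diff and its default_model_kwargs; the dimensions and batch shape are illustrative assumptions):

import torch
from titans_pytorch.memory_models import MemoryMLP, ResidualNorm

dim = 64  # illustrative hidden dimension

# the memory model itself no longer norms its output or adds the input back
mem = MemoryMLP(dim = dim, depth = 2)

# the wrapper reinstates what each model previously did internally:
# out = LayerNorm(model(x)) + x
wrapped = ResidualNorm(dim = dim, model = mem)

x = torch.randn(2, 16, dim)
assert wrapped(x).shape == x.shape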
@@ -16,7 +16,8 @@ from tensordict import TensorDict
 from titans_pytorch.associative_scan import AssocScan
 
 from titans_pytorch.memory_models import(
-    MemoryMLP
+    MemoryMLP,
+    ResidualNorm
 )
 
 import einx
@@ -234,6 +235,7 @@ class NeuralMemory(Module):
         init_decay_bias = None,
         accept_weight_residual = False,
         gated_transition = False,
+        mem_model_norm_add_residual = True, # by default, layernorm output and add residual as proposed in TTT paper, but could be removed
         default_model_kwargs: dict = dict(
             depth = 2,
             expansion_factor = 4.
@@ -304,6 +306,9 @@ class NeuralMemory(Module):
 
         # the memory is the weights of the model
 
+        if mem_model_norm_add_residual:
+            model = ResidualNorm(dim = dim_head, model = model)
+
         self.memory_model = model
 
         mem_model_params = dict(model.named_parameters())
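
On the NeuralMemory side the wrapper is opt-out via the new mem_model_norm_add_residual flag. A rough usage sketch, assuming the dim / chunk_size constructor arguments from the project README (the values are illustrative; only mem_model_norm_add_residual itself comes from this diff):

from titans_pytorch import NeuralMemory

# default: the chosen memory model is wrapped in ResidualNorm (TTT-style norm + residual)
mem = NeuralMemory(dim = 384, chunk_size = 64)

# opt out, leaving the raw memory model as-is
mem_plain = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    mem_model_norm_add_residual = False
)

# the wrapping is visible on the stored module
print(type(mem.memory_model).__name__)        # expected: ResidualNorm
print(type(mem_plain.memory_model).__name__)  # expected: the underlying memory model (MemoryMLP by default)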