titans-pytorch 0.0.7.tar.gz → 0.0.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/PKG-INFO +22 -9
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/README.md +19 -7
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/pyproject.toml +3 -2
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/titans_pytorch/titans.py +13 -3
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/train.py +27 -3
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/.gitignore +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/LICENSE +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/data/README.md +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/data/enwik8.gz +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/fig1.png +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/fig2.png +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.0.7 → titans_pytorch-0.0.9}/titans_pytorch/associative_scan.py +0 -0
````diff
--- titans_pytorch-0.0.7/PKG-INFO
+++ titans_pytorch-0.0.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.7
+Version: 0.0.9
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -39,7 +39,8 @@ Requires-Dist: einx>=0.3.0
 Requires-Dist: tensordict>=0.6.2
 Requires-Dist: torch>=2.3
 Provides-Extra: examples
-Requires-Dist: local-attention>=1.
+Requires-Dist: local-attention>=1.10.0; extra == 'examples'
+Requires-Dist: taylor-series-linear-attention; extra == 'examples'
 Provides-Extra: test
 Requires-Dist: pytest; extra == 'test'
 Description-Content-Type: text/markdown
@@ -64,16 +65,28 @@ $ pip install titans-pytorch
 import torch
 from titans_pytorch import NeuralMemory
 
-x = torch.randn(2, 64, 32)
-
 mem = NeuralMemory(
-    dim =
-    chunk_size =
-
+    dim = 384,
+    chunk_size = 64,
+    pre_rmsnorm = True
+).cuda()
+
+seq = torch.randn(2, 1024, 384).cuda()
+retrieved = mem(seq)
 
-
+assert seq.shape == retrieved.shape
+```
 
-
+## Experiments
+
+```bash
+$ pip install .[examples]
+```
+
+Then
+
+```bash
+$ python train.py
 ```
 
 ## Citations
````
````diff
--- titans_pytorch-0.0.7/README.md
+++ titans_pytorch-0.0.9/README.md
@@ -18,16 +18,28 @@ $ pip install titans-pytorch
 import torch
 from titans_pytorch import NeuralMemory
 
-x = torch.randn(2, 64, 32)
-
 mem = NeuralMemory(
-    dim =
-    chunk_size =
-
+    dim = 384,
+    chunk_size = 64,
+    pre_rmsnorm = True
+).cuda()
+
+seq = torch.randn(2, 1024, 384).cuda()
+retrieved = mem(seq)
 
-
+assert seq.shape == retrieved.shape
+```
 
-
+## Experiments
+
+```bash
+$ pip install .[examples]
+```
+
+Then
+
+```bash
+$ python train.py
 ```
 
 ## Citations
````
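For readers who just want to smoke-test the new API without a GPU, the same README example reads as the sketch below. This is an illustrative variant only (the published example uses `.cuda()`), and it assumes the pure-PyTorch associative scan used internally also runs on CPU.

```python
# Illustrative CPU variant of the 0.0.9 README example (assumption: the
# internal pure-PyTorch associative scan does not require CUDA).
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,           # feature dimension of the incoming sequence
    chunk_size = 64,     # chunked processing size; shorter sequences return zeros
    pre_rmsnorm = True   # new flag: presumably RMSNorms the input before the memory ops
)

seq = torch.randn(2, 1024, 384)
retrieved = mem(seq)

assert seq.shape == retrieved.shape   # retrieval preserves the input shape
```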
```diff
--- titans_pytorch-0.0.7/pyproject.toml
+++ titans_pytorch-0.0.9/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.0.7"
+version = "0.0.9"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -37,7 +37,8 @@ Repository = "https://github.com/lucidrains/titans-pytorch"
 
 [project.optional-dependencies]
 examples = [
-    "local-attention>=1.
+    "local-attention>=1.10.0",
+    "taylor-series-linear-attention"
 ]
 test = [
     "pytest"
```
```diff
--- titans_pytorch-0.0.7/titans_pytorch/titans.py
+++ titans_pytorch-0.0.9/titans_pytorch/titans.py
@@ -55,6 +55,10 @@ def pack_one_with_inverse(t, pattern):
 
     return packed, inverse
 
+def softclamp_max(t, max_value):
+    range_value = max_value / 2
+    return ((t / range_value).tanh() * range_value) + range_value
+
 # classes
 
 class MLP(Module):
```
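The new `softclamp_max` helper smoothly bounds a tensor: with `range_value = max_value / 2`, `tanh` keeps its argument in (-1, 1), so the result lies in the open interval (0, max_value) while remaining differentiable. It is used further down to cap the learned per-token step size. A quick sketch of the bound (illustrative only, not part of the package diff):

```python
# Sketch only: checks the range of the softclamp_max helper added in 0.0.9.
import torch

def softclamp_max(t, max_value):
    # identical to the function added in titans.py
    range_value = max_value / 2
    return ((t / range_value).tanh() * range_value) + range_value

t = torch.linspace(-1e-4, 1e-4, steps = 9)
clamped = softclamp_max(t, max_value = 1e-5)

# mathematically the output lies in (0, max_value); in floating point the
# tanh may saturate exactly at the endpoints for large |t|
assert ((clamped >= 0.) & (clamped <= 1e-5)).all()
```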
```diff
--- titans_pytorch-0.0.7/titans_pytorch/titans.py
+++ titans_pytorch-0.0.9/titans_pytorch/titans.py
@@ -92,7 +96,8 @@ class NeuralMemory(Module):
         chunk_size = 1,
         model: Module | None = None,
         store_memory_loss_fn: Callable = default_loss_fn,
-        pre_rmsnorm = False
+        pre_rmsnorm = False,
+        max_adaptive_step_size = 1e-5
     ):
         super().__init__()
 
@@ -144,6 +149,8 @@ class NeuralMemory(Module):
             Rearrange('... 1 -> ...')
         )
 
+        self.max_adaptive_step_size = max_adaptive_step_size
+
         # weight decay factor
 
         self.to_decay_factor = nn.Sequential(
@@ -188,7 +195,7 @@ class NeuralMemory(Module):
 
         batch = seq.shape[0]
 
-        adaptive_lr = self.to_adaptive_step(seq).
+        adaptive_lr = softclamp_max(self.to_adaptive_step(seq), self.max_adaptive_step_size)
 
         adaptive_momentum = self.to_momentum(seq).sigmoid()
         decay_factor = self.to_decay_factor(seq).sigmoid()
@@ -304,7 +311,10 @@
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
         return_next_memories = False
     ):
-        batch = seq.shape[
+        batch, seq_len = seq.shape[:2]
+
+        if seq_len < self.chunk_size:
+            return torch.zeros_like(seq)
 
         if exists(past_state):
             past_state = tuple(TensorDict(d) for d in past_state)
```
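The final hunk above changes how very short inputs are handled: when the sequence is shorter than `chunk_size`, `forward` now returns zeros of the same shape rather than attempting storage and retrieval. A small illustrative sketch of that behavior from the caller's side (not part of the diff; the early return happens before any heavy computation, so it can be checked on CPU):

```python
# Illustrative sketch: the 0.0.9 early-return path for sub-chunk sequences.
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,     # sequences shorter than this are not memorized
    pre_rmsnorm = True
)

short_seq = torch.randn(2, 16, 384)   # 16 tokens < chunk_size of 64
retrieved = mem(short_seq)

# nothing stored or retrieved; the output is all zeros with the input's shape
assert retrieved.shape == short_seq.shape
assert (retrieved == 0).all()
```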
```diff
--- titans_pytorch-0.0.7/train.py
+++ titans_pytorch-0.0.9/train.py
@@ -4,12 +4,15 @@ import gzip
 import numpy as np
 
 import torch
+from torch import nn
 from torch.optim import Adam
 from torch.nn import functional as F
 from torch.utils.data import DataLoader, Dataset
 
 from local_attention import LocalTransformer
 
+from taylor_series_linear_attention import TaylorSeriesLinearAttn
+
 from titans_pytorch.titans import NeuralMemory
 
 # constants
@@ -21,6 +24,7 @@ LEARNING_RATE = 2e-4
 VALIDATE_EVERY = 100
 GENERATE_EVERY = 500
 GENERATE_LENGTH = 512
+SHOULD_GENERATE = False
 SEQ_LEN = 512
 
 # helpers
@@ -38,13 +42,33 @@ def decode_tokens(tokens):
 
 # instantiate GPT-like decoder model
 
+titans_neural_memory = NeuralMemory(
+    dim = 384,
+    chunk_size = 64,
+    pre_rmsnorm = True
+)
+
+titans_neural_memory = nn.Sequential(
+    titans_neural_memory,
+    nn.RMSNorm(384)
+)
+
+linear_attn = TaylorSeriesLinearAttn(
+    dim = 384,
+    dim_head = 16,
+    heads = 16,
+    causal = True
+)
+
 model = LocalTransformer(
     num_tokens = 256,
-    dim =
+    dim = 384,
     depth = 8,
     causal = True,
     local_attn_window_size = 64,
-    max_seq_len = SEQ_LEN
+    max_seq_len = SEQ_LEN,
+    global_attn_layer = titans_neural_memory,
+    layers_insert_global_attn = (4,)
 ).cuda()
 
 # prepare enwik8 data
```
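In the updated `train.py`, the neural memory, post-normed with `nn.RMSNorm`, is handed to `LocalTransformer` as a global layer inserted at depth 4 (`layers_insert_global_attn = (4,)`). How local-attention wires it in internally is not shown here; as a rough mental model only, the inserted module acts like a standalone sequence-to-sequence layer that can be applied residually to the token stream, sketched below (an assumption, not local-attention's actual code; like train.py it assumes a CUDA device).

```python
# Rough mental model only (assumption): a global memory layer applied
# residually to the token stream, mirroring the module built in train.py.
import torch
from torch import nn
from titans_pytorch import NeuralMemory

dim = 384

global_memory = nn.Sequential(
    NeuralMemory(dim = dim, chunk_size = 64, pre_rmsnorm = True),
    nn.RMSNorm(dim)   # normalize the retrieved memories, as in train.py
).cuda()

tokens = torch.randn(1, 512, dim).cuda()   # embedded token sequence

# retrieved memories added back onto the residual stream
tokens = tokens + global_memory(tokens)
print(tokens.shape)   # torch.Size([1, 512, 384])
```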
```diff
--- titans_pytorch-0.0.7/train.py
+++ titans_pytorch-0.0.9/train.py
@@ -97,7 +121,7 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
             loss = model(next(val_loader), return_loss = True)
             print(f'validation loss: {loss.item()}')
 
-    if i % GENERATE_EVERY == 0:
+    if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
         model.eval()
         inp = random.choice(val_dataset)[:-1]
         prime = decode_tokens(inp)
```