titans-pytorch 0.0.1__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.0.1
+ Version: 0.0.8
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -39,6 +39,8 @@ Requires-Dist: einx>=0.3.0
  Requires-Dist: tensordict>=0.6.2
  Requires-Dist: torch>=2.3
  Provides-Extra: examples
+ Requires-Dist: local-attention>=1.10.0; extra == 'examples'
+ Requires-Dist: taylor-series-linear-attention; extra == 'examples'
  Provides-Extra: test
  Requires-Dist: pytest; extra == 'test'
  Description-Content-Type: text/markdown
@@ -49,7 +51,7 @@ Description-Content-Type: text/markdown

  ## Titans - Pytorch (wip)

- Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module.
+ Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.

  ## Install

@@ -63,13 +65,16 @@ $ pip install titans-pytorch
  import torch
  from titans_pytorch import NeuralMemory

- x = torch.randn(2, 64, 32)
+ mem = NeuralMemory(
+     dim = 384,
+     chunk_size = 64,
+     pre_rmsnorm = True
+ ).cuda()

- mem = NeuralMemory(32)
+ seq = torch.randn(2, 1024, 384).cuda()
+ retrieved = mem(seq)

- out = mem(x)
-
- assert x.shape == out.shape
+ assert seq.shape == retrieved.shape
  ```

  ## Citations
@@ -4,7 +4,7 @@

  ## Titans - Pytorch (wip)

- Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module.
+ Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.

  ## Install

@@ -18,13 +18,16 @@ $ pip install titans-pytorch
  import torch
  from titans_pytorch import NeuralMemory

- x = torch.randn(2, 64, 32)
+ mem = NeuralMemory(
+     dim = 384,
+     chunk_size = 64,
+     pre_rmsnorm = True
+ ).cuda()

- mem = NeuralMemory(32)
+ seq = torch.randn(2, 1024, 384).cuda()
+ retrieved = mem(seq)

- out = mem(x)
-
- assert x.shape == out.shape
+ assert seq.shape == retrieved.shape
  ```

  ## Citations
@@ -0,0 +1,3 @@
+ # Data source
+
+ The enwik8 data was downloaded from the Hutter prize page: http://prize.hutter1.net/
Binary file
@@ -1,6 +1,6 @@
  [project]
  name = "titans-pytorch"
- version = "0.0.1"
+ version = "0.0.8"
  description = "Titans"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -36,7 +36,10 @@ Homepage = "https://pypi.org/project/titans-pytorch/"
  Repository = "https://github.com/lucidrains/titans-pytorch"

  [project.optional-dependencies]
- examples = []
+ examples = [
+     "local-attention>=1.10.0",
+     "taylor-series-linear-attention"
+ ]
  test = [
      "pytest"
  ]
@@ -1,4 +1,5 @@
  from __future__ import annotations
+ import math
  from functools import partial

  import torch
@@ -11,12 +12,13 @@ from tensordict import TensorDict

  from titans_pytorch.associative_scan import (
      associative_scan,
-     binary_operator
+     binary_operator,
+     pad_at_dim
  )

  import einx
  from einops import rearrange, pack, unpack
- from einops.layers.torch import Rearrange
+ from einops.layers.torch import Rearrange, Reduce

  """
  ein notation:
@@ -41,6 +43,9 @@ def default(v, d):
  def round_down_multiple(seq, mult):
      return seq // mult * mult

+ def round_up_multiple(seq, mult):
+     return math.ceil(seq / mult) * mult
+
  def pack_one_with_inverse(t, pattern):
      packed, packed_shape = pack([t], pattern)

@@ -50,6 +55,10 @@ def pack_one_with_inverse(t, pattern):

      return packed, inverse

+ def softclamp_max(t, max_value):
+     range_value = max_value / 2
+     return ((t / range_value).tanh() * range_value) + range_value
+
  # classes

  class MLP(Module):
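The helper added just above, `softclamp_max`, is a shifted and scaled tanh: it squashes any real-valued input smoothly into the range `[0, max_value]`. Later hunks use it to bound the learned adaptive step size (`max_adaptive_step_size`, default `1e-5`). A standalone sketch of the same formula, for illustration only (not part of the package):

```python
# Illustrative sketch: softclamp_max is a tanh centered at max_value / 2,
# so its outputs never leave [0, max_value].
import torch

def softclamp_max(t, max_value):
    range_value = max_value / 2
    return ((t / range_value).tanh() * range_value) + range_value

x = torch.tensor([-100., -1., 0., 1., 100.])
print(softclamp_max(x, 1e-5))  # every value squashed into [0, 1e-5]
```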
@@ -84,11 +93,17 @@ class NeuralMemory(Module):
      def __init__(
          self,
          dim,
+         chunk_size = 1,
          model: Module | None = None,
-         store_memory_loss_fn: Callable = default_loss_fn
+         store_memory_loss_fn: Callable = default_loss_fn,
+         pre_rmsnorm = False,
+         max_adaptive_step_size = 1e-5
      ):
          super().__init__()

+         self.retrieve_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
+         self.store_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
+
          if not exists(model):
              model = MLP(dim, depth = 4)

@@ -98,11 +113,15 @@ class NeuralMemory(Module):

          self.memory_model = model

+         # the chunk size within the paper where adaptive step, momentum, weight decay are shared
+
+         self.chunk_size = chunk_size
+
          # prepare function for per sample gradients from model above, using torch.func

          def forward_and_loss(params, inputs, target):
              pred = functional_call(self.memory_model, params, inputs)
-             loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) == v|²
+             loss = self.store_memory_loss_fn(pred, target) # simple mse loss in paper - eq (12) - |M(k) - v|²
              return loss

          self.per_sample_grad_and_value_fn = vmap(grad_and_value(forward_and_loss), in_dims = (None, 0, 0))
@@ -119,9 +138,25 @@ class NeuralMemory(Module):
          # learned adaptive learning rate and momentum
          # todo - explore mlp layerwise learned lr / momentum

-         self.to_momentum = LinearNoBias(dim, 1)
-         self.to_adaptive_step = nn.Sequential(LinearNoBias(dim, 1), Rearrange('... 1 -> ...'))
-         self.to_decay_factor = nn.Sequential(LinearNoBias(dim, 1), nn.Sigmoid()) # weight decay factor
+         self.to_momentum = nn.Sequential(
+             Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+             LinearNoBias(dim, 1)
+         )
+
+         self.to_adaptive_step = nn.Sequential(
+             Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+             LinearNoBias(dim, 1),
+             Rearrange('... 1 -> ...')
+         )
+
+         self.max_adaptive_step_size = max_adaptive_step_size
+
+         # weight decay factor
+
+         self.to_decay_factor = nn.Sequential(
+             Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size),
+             LinearNoBias(dim, 1)
+         )

      def init_weights_and_momentum(self):
          params = TensorDict(dict(self.memory_model.named_parameters()))
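The change above prepends `Reduce('b (n c) ... -> b n ...', 'mean', c = chunk_size)` to each projection, so the learned step size, momentum and weight decay are each predicted once per chunk of `chunk_size` mean-pooled tokens rather than once per token, matching the per-chunk sharing of those quantities described in the paper. A quick shape check of that pooling, written with a concrete three-axis pattern (illustrative only):

```python
# Per-chunk pooling before the linear projections: one vector per chunk of 64 tokens.
import torch
from einops.layers.torch import Reduce

pool = Reduce('b (n c) d -> b n d', 'mean', c = 64)
tokens = torch.randn(2, 1024, 384)   # batch 2, 1024 tokens, dim 384
print(pool(tokens).shape)            # torch.Size([2, 16, 384]) -> 16 chunks per sequence
```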
@@ -137,6 +172,18 @@ class NeuralMemory(Module):
          past_state: tuple[dict[str, Tensor], dict[str, Tensor]]
      ):

+         seq = self.store_norm(seq)
+
+         # curtail sequence by multiple of the chunk size
+         # only a complete chunk of the sequence provides the memory for the next chunk
+
+         seq_len, chunk_size = seq.shape[-2], self.chunk_size
+         round_down_seq_len = round_down_multiple(seq_len, self.chunk_size)
+
+         seq = seq[:, :round_down_seq_len]
+
+         # curr weights + past weights, in the case that the initial weights are learned
+
          curr_weights = TensorDict(dict(self.memory_model.named_parameters()))

          past_state = tuple(TensorDict(d) for d in past_state)
@@ -148,16 +195,19 @@ class NeuralMemory(Module):

          batch = seq.shape[0]

-         adaptive_lr = self.to_adaptive_step(seq)
-         adaptive_momentum = self.to_momentum(seq)
+         adaptive_lr = softclamp_max(self.to_adaptive_step(seq), self.max_adaptive_step_size)

-         decay_factor = self.to_decay_factor(seq)
+         adaptive_momentum = self.to_momentum(seq).sigmoid()
+         decay_factor = self.to_decay_factor(seq).sigmoid()

          # keys and values

-         seq = rearrange(seq, 'b n d -> (b n) d')
          keys, values = self.to_keys_values(seq).chunk(2, dim = -1)

+         # take care of chunking
+
+         keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = self.chunk_size) for t in (keys, values))
+
          # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

          grads, aux_store_loss = self.per_sample_grad_and_value_fn(dict(curr_weights), keys, values)
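After the `rearrange` above, each chunk of `chunk_size` keys/values becomes its own sample along the leading `(b n)` dimension, which is exactly what `per_sample_grad_and_value_fn` (a `vmap` of `grad_and_value` with `in_dims = (None, 0, 0)`) maps over while broadcasting the memory weights, yielding one gradient, i.e. one "surprise", per chunk. A minimal sketch of that pattern with a stand-in model (illustrative, not the package's MLP):

```python
# Per-chunk gradients via torch.func: weights broadcast (None), chunks mapped over dim 0.
import torch
from torch import nn
from torch.func import functional_call, grad_and_value, vmap

model = nn.Linear(16, 16)                  # stand-in for the memory model
params = dict(model.named_parameters())

def forward_and_loss(params, inputs, target):
    pred = functional_call(model, params, inputs)
    return ((pred - target) ** 2).mean()   # mse per chunk, as in eq (12)

per_chunk_grad = vmap(grad_and_value(forward_and_loss), in_dims = (None, 0, 0))

keys = torch.randn(8, 64, 16)              # (b n) = 8 chunks of 64 tokens each
values = torch.randn(8, 64, 16)
grads, losses = per_chunk_grad(params, keys, values)
print(grads['weight'].shape, losses.shape) # torch.Size([8, 16, 16]) torch.Size([8])
```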
@@ -172,31 +222,24 @@ class NeuralMemory(Module):

          surprises = grads.apply(lambda t: einx.multiply('b n ..., b n -> b n ...', t, -adaptive_lr))

-         # derive momentum with associative scan - eq (10)
+         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates

          next_momentum = TensorDict()
+         updates = TensorDict()

          for param_name, surprise in surprises.items():
              surprise, inverse_pack = pack_one_with_inverse(surprise, 'b n *')

-             _, momentum = associative_scan(binary_operator, (adaptive_momentum, surprise)) # momentum is S / surprise in the paper
-
-             momentum = inverse_pack(momentum)
-
-             next_momentum[param_name] = momentum
-
-         # use associative scan again for learned forgetting (weight decay) - eq (13)
+             # derive momentum with associative scan - eq (10)

-         updates = TensorDict()
+             _, momentum = associative_scan(binary_operator, (adaptive_momentum, surprise)) # momentum is S / surprise in the paper

-         for param_name, momentum in next_momentum.items():
-             momentum, inverse_pack = pack_one_with_inverse(momentum, 'b n *')
+             # use associative scan again for learned forgetting (weight decay) - eq (13)

              _, update = associative_scan(binary_operator, (1. - decay_factor, momentum)) # momentum is S / surprise in the paper

-             update = inverse_pack(update)
-
-             updates[param_name] = update
+             updates[param_name] = inverse_pack(update)
+             next_momentum[param_name] = inverse_pack(momentum)

          # compute the next weight per batch

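The two `associative_scan` calls in the hunk above evaluate first-order linear recurrences over the chunk dimension: eq (10) accumulates each chunk's surprise into a momentum term gated by `adaptive_momentum`, and eq (13) applies learned forgetting through `1 - decay_factor` to produce the running weight update. A plain-loop reference for the same semantics (names and shapes here are illustrative; the scan computes this in parallel rather than sequentially):

```python
# Reference loop for the gated accumulations performed by the two associative scans.
import torch

def reference_scan(gates, inputs):
    # out[t] = gates[t] * out[t - 1] + inputs[t], starting from zero
    out, acc = [], torch.zeros_like(inputs[0])
    for g, x in zip(gates, inputs):
        acc = g * acc + x
        out.append(acc)
    return torch.stack(out)

num_chunks = 16
surprise = torch.randn(num_chunks, 4)            # -adaptive_lr * grad, per chunk
adaptive_momentum = torch.rand(num_chunks, 1)    # sigmoid-ed momentum gate
decay_factor = torch.rand(num_chunks, 1)         # sigmoid-ed weight decay

momentum = reference_scan(adaptive_momentum, surprise)    # eq (10)
updates = reference_scan(1. - decay_factor, momentum)     # eq (13)
print(momentum.shape, updates.shape)
```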
@@ -204,14 +247,28 @@ class NeuralMemory(Module):

          next_state = (curr_weights + last_update, next_momentum)

-         return updates, next_state, aux_store_loss.mean()
+         return updates, next_state, aux_store_loss.mean() / chunk_size

      def retrieve_memories(
          self,
          seq,
          past_weights: dict[str, Tensor] | None = None,
      ):
-         batch = seq.shape[0]
+         chunk_size = self.chunk_size
+         batch, seq_len = seq.shape[:2]
+
+         seq = self.retrieve_norm(seq)
+
+         assert seq_len >= chunk_size
+
+         seq = seq[:, (chunk_size - 1):]
+         curtailed_seq_len = seq.shape[-2]
+
+         next_seq_len = round_up_multiple(curtailed_seq_len, chunk_size)
+
+         padding = next_seq_len - curtailed_seq_len
+
+         seq = pad_at_dim(seq, (0, padding), dim = 1)

          # the parameters of the memory model stores the memories of the key / values
          # when the MLP has only 1 weight matrix, it is equivalent to `kv` fast weight memories from linear attention literature (recall fetching of memories is q @ (kv)) / schmidhuber's paper
@@ -231,7 +288,7 @@ class NeuralMemory(Module):
          # fetch values from memory model

          curr_weights = curr_weights.apply(lambda t: rearrange(t, 'b n ... -> (b n) ...'))
-         queries = rearrange(queries, 'b n d -> (b n) 1 d')
+         queries = rearrange(queries, 'b (n c) d -> (b n) c d', c = chunk_size)

          # forward functional call

@@ -239,7 +296,12 @@ class NeuralMemory(Module):

          # reconstitute batch dimension

-         values = rearrange(values, '(b n) 1 d -> b n d', b = batch)
+         values = rearrange(values, '(b n) c d -> b (n c) d', b = batch)
+
+         # restore
+
+         values = pad_at_dim(values, (chunk_size - 1, 0), dim = 1, value = 0.) # todo, used a learned null memory embedding instead of 0s for retrieving from empty neural memory
+         values = values[:, :-padding]

          return values

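The retrieval changes above keep queries causally aligned with what has already been stored: the sequence is shifted left by `chunk_size - 1`, right-padded up to a chunk multiple for the chunked forward pass, and the retrieved values are then left-padded with `chunk_size - 1` zeros and trimmed back to the original length, so positions in the first incomplete chunk read from an empty (zero) memory. The index bookkeeping, with illustrative numbers:

```python
# Index bookkeeping of retrieve_memories for chunk_size = 64 and a 1024-token sequence.
import math

chunk_size, seq_len = 64, 1024
curtailed = seq_len - (chunk_size - 1)                    # 961 positions after the shift
padded = math.ceil(curtailed / chunk_size) * chunk_size   # 1024 (round_up_multiple)
padding = padded - curtailed                              # 63 tokens of right padding

# after the memory model runs: pad chunk_size - 1 zeros on the left, slice off the
# right padding, and the original 1024 positions are restored
restored = (chunk_size - 1) + padded - padding
assert restored == seq_len
```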
@@ -0,0 +1,132 @@
+ import random
+ import tqdm
+ import gzip
+ import numpy as np
+
+ import torch
+ from torch import nn
+ from torch.optim import Adam
+ from torch.nn import functional as F
+ from torch.utils.data import DataLoader, Dataset
+
+ from local_attention import LocalTransformer
+
+ from taylor_series_linear_attention import TaylorSeriesLinearAttn
+
+ from titans_pytorch.titans import NeuralMemory
+
+ # constants
+
+ NUM_BATCHES = int(1e5)
+ BATCH_SIZE = 4
+ GRADIENT_ACCUMULATE_EVERY = 4
+ LEARNING_RATE = 2e-4
+ VALIDATE_EVERY = 100
+ GENERATE_EVERY = 500
+ GENERATE_LENGTH = 512
+ SHOULD_GENERATE = False
+ SEQ_LEN = 512
+
+ # helpers
+
+ def cycle(loader):
+     while True:
+         for data in loader:
+             yield data
+
+ def decode_token(token):
+     return str(chr(max(32, token)))
+
+ def decode_tokens(tokens):
+     return ''.join(list(map(decode_token, tokens)))
+
+ # instantiate GPT-like decoder model
+
+ titans_neural_memory = NeuralMemory(
+     dim = 384,
+     chunk_size = 64,
+     pre_rmsnorm = True
+ )
+
+ titans_neural_memory = nn.Sequential(
+     titans_neural_memory,
+     nn.RMSNorm(384)
+ )
+
+ linear_attn = TaylorSeriesLinearAttn(
+     dim = 384,
+     dim_head = 16,
+     heads = 16,
+     causal = True
+ )
+
+ model = LocalTransformer(
+     num_tokens = 256,
+     dim = 384,
+     depth = 8,
+     causal = True,
+     local_attn_window_size = 64,
+     max_seq_len = SEQ_LEN,
+     global_attn_layer = titans_neural_memory,
+     layers_insert_global_attn = (4,)
+ ).cuda()
+
+ # prepare enwik8 data
+
+ with gzip.open('./data/enwik8.gz') as file:
+     data = np.frombuffer(file.read(int(95e6)), dtype = np.uint8).copy()
+     data_train, data_val = np.split(data, [int(90e6)])
+     data_train, data_val = map(torch.from_numpy, (data_train, data_val))
+
+ class TextSamplerDataset(Dataset):
+     def __init__(self, data, seq_len):
+         super().__init__()
+         self.data = data
+         self.seq_len = seq_len
+
+     def __getitem__(self, index):
+         rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+         full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
+         return full_seq.cuda()
+
+     def __len__(self):
+         return self.data.size(0) // self.seq_len
+
+ train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+ val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+ train_loader = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
+ val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))
+
+ # optimizer
+
+ optim = Adam(model.parameters(), lr=LEARNING_RATE)
+
+ # training
+
+ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
+     model.train()
+
+     for __ in range(GRADIENT_ACCUMULATE_EVERY):
+         loss = model(next(train_loader), return_loss = True)
+         loss.backward()
+
+     print(f'training loss: {loss.item()}')
+     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+     optim.step()
+     optim.zero_grad()
+
+     if i % VALIDATE_EVERY == 0:
+         model.eval()
+         with torch.no_grad():
+             loss = model(next(val_loader), return_loss = True)
+             print(f'validation loss: {loss.item()}')
+
+     if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
+         model.eval()
+         inp = random.choice(val_dataset)[:-1]
+         prime = decode_tokens(inp)
+         print(f'%s \n\n %s', (prime, '*' * 100))
+
+         sample = model.generate(inp[None, ...], GENERATE_LENGTH, use_kv_cache = False)
+         output_str = decode_tokens(sample[0])
+         print(output_str)