titans-pytorch 0.0.31__tar.gz → 0.0.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/PKG-INFO +1 -1
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/pyproject.toml +1 -1
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/titans_pytorch/mac_transformer.py +70 -9
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/train.py +0 -3
- titans_pytorch-0.0.34/train_mac.py +129 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/.gitignore +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/LICENSE +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/README.md +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/data/README.md +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/data/enwik8.gz +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/fig1.png +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/fig2.png +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/requirements.txt +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/tests/test_titans.py +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/titans_pytorch/titans.py +0 -0
- {titans_pytorch-0.0.31 → titans_pytorch-0.0.34}/titans_pytorch/titans_attn_memory.py +0 -0
```diff
--- titans_pytorch-0.0.31/titans_pytorch/mac_transformer.py
+++ titans_pytorch-0.0.34/titans_pytorch/mac_transformer.py
@@ -17,6 +17,10 @@ from hyper_connections import get_init_and_expand_reduce_stream_functions
 from axial_positional_embedding import ContinuousAxialPositionalEmbedding
 from rotary_embedding_torch import RotaryEmbedding
 
+# proposed neural memory
+
+from titans_pytorch.titans import NeuralMemory
+
 # constants
 
 LinearNoBias = partial(Linear, bias = False)
```
```diff
@@ -46,13 +50,20 @@ def pad_and_segment_with_inverse(seq, segment_len):
     next_seq_len_mult = round_up_multiple(seq_len, segment_len)
 
     padding = next_seq_len_mult - seq_len
-    seq = F.pad(seq, (0, 0, 0, padding))
+    needs_pad = padding > 0
+
+    if needs_pad:
+        seq = F.pad(seq, (0, 0, 0, padding))
 
     seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)
 
     def inverse(out):
         out = rearrange(out, '(b w) n d -> b (w n) d', b = batch)
-        return out[:, :-padding]
+
+        if needs_pad:
+            out = out[:, :-padding]
+
+        return out
 
     return seq, inverse
 
```
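The guard matters because a Python slice like `out[:, :-0]` keeps nothing rather than everything, so stripping a zero-length padding would empty the sequence; the new `needs_pad` flag skips both the pad and the strip when the length is already a multiple of `segment_len`. A minimal round-trip sketch, assuming titans-pytorch 0.0.34 and its dependencies are importable (tensor sizes here are illustrative):

```python
# illustrative only: exercise pad_and_segment_with_inverse from titans_pytorch 0.0.34
import torch
from titans_pytorch.mac_transformer import pad_and_segment_with_inverse

seq = torch.randn(2, 100, 384)                  # 100 is not a multiple of the segment length
segments, inverse = pad_and_segment_with_inverse(seq, 64)
print(segments.shape)                           # (4, 64, 384): padded to 128, then windowed
print(inverse(segments).shape)                  # (2, 100, 384): padding stripped again

seq = torch.randn(2, 128, 384)                  # already a multiple: needs_pad is False
segments, inverse = pad_and_segment_with_inverse(seq, 64)
print(inverse(segments).shape)                  # (2, 128, 384): nothing is padded or stripped
```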
```diff
@@ -161,7 +172,9 @@ class MemoryAsContextTransformer(Module):
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
-        num_residual_streams = 4
+        num_residual_streams = 4,
+        neural_memory_kwargs: dict = dict(),
+        neural_memory_layers: tuple[int, ...] | None = None,
     ):
         super().__init__()
 
```
```diff
@@ -181,8 +194,25 @@ class MemoryAsContextTransformer(Module):
         init_hyper_conn, self.expand_streams, self.reduce_streams = get_init_and_expand_reduce_stream_functions(num_residual_streams, disable = num_residual_streams == 1)
 
         self.layers = ModuleList([])
+        self.neural_mem_layers = ModuleList([])
+
+        layers = tuple(range(1, depth + 1))
+        neural_memory_layers = set(default(neural_memory_layers, layers))
+
+        for layer in layers:
+
+            # neural memory
+
+            mem = None
+
+            if num_longterm_mem_tokens > 0 and layer in neural_memory_layers:
+                mem = NeuralMemory(dim = dim, chunk_size = num_longterm_mem_tokens)
+                mem = init_hyper_conn(dim = dim, branch = mem)
+
+            self.neural_mem_layers.append(mem)
+
+            # attention and feedforward
 
-        for _ in range(depth):
             attn = SegmentedAttention(
                 dim = dim,
                 dim_head = dim_head,
```
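In the updated constructor, layers are numbered from 1; leaving `neural_memory_layers` as `None` selects every layer (via `default(...)`), a tuple restricts memory to just those layers, and no `NeuralMemory` is built at all when `num_longterm_mem_tokens` is 0. A hedged usage sketch; the argument values are illustrative and mirror the `train_mac.py` script added later in this diff:

```python
# illustrative instantiation of the new 0.0.34 constructor arguments
from titans_pytorch.mac_transformer import MemoryAsContextTransformer

model = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 384,
    depth = 4,
    segment_len = 64,
    num_persist_mem_tokens = 4,
    num_longterm_mem_tokens = 4,     # must be > 0 for any NeuralMemory block to be created
    neural_memory_layers = (2, 4),   # 1-indexed layers; None (the default) selects every layer
    neural_memory_kwargs = dict()    # additional NeuralMemory options (see train_mac.py below)
)
```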
```diff
@@ -203,7 +233,14 @@ class MemoryAsContextTransformer(Module):
 
         self.to_logits = LinearNoBias(dim, num_tokens)
 
-    def forward(self, x):
+    def forward(
+        self,
+        x,
+        return_loss = False
+    ):
+
+        if return_loss:
+            x, labels = x[:, :-1], x[:, 1:]
 
         # math
 
```
```diff
@@ -221,7 +258,7 @@ class MemoryAsContextTransformer(Module):
         x, inverse_segment = pad_and_segment_with_inverse(x, segment_len)
 
         mems = repeat(self.longterm_mems, 'n d -> b n d', b = x.shape[0])
-        x = …
+        x = cat((mems, x), dim = -2)
 
         x = inverse_segment(x)
 
```
```diff
@@ -235,8 +272,27 @@ class MemoryAsContextTransformer(Module):
 
         x = self.expand_streams(x)
 
-        for attn, ff in self.layers:
+        for (attn, ff), maybe_neural_mem in zip(self.layers, self.neural_mem_layers):
+
+            if exists(maybe_neural_mem):
+                batch_streams = x.shape[0]
+
+                x, inverse_segment = pad_and_segment_with_inverse(x, total_segment_len)
+
+                longterm_mems, x = x[:, :num_longterm_mem_tokens], x[:, num_longterm_mem_tokens:]
+
+                longterm_mems = rearrange(longterm_mems, '(b w) n d -> b (w n) d', b = batch_streams)
+
+                longterm_mems = maybe_neural_mem(longterm_mems)
+
+                longterm_mems = rearrange(longterm_mems, 'b (w n) d -> (b w) n d', n = num_longterm_mem_tokens)
+
+                x = cat((longterm_mems, x), dim = -2)
+
+                x = inverse_segment(x)
+
             x = attn(x)
+
             x = ff(x)
 
         x = self.reduce_streams(x)
```
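The bookkeeping in this new block: the sequence arrives windowed as `(batch * windows, segment_len + num_longterm_mem_tokens, dim)`, the per-window memory tokens are gathered into one contiguous sequence per batch element, run through the layer's `NeuralMemory`, then scattered back in front of their windows. A toy sketch of just the shape handling (the memory module is replaced by an identity, and the leading dimension ignores the extra hyper-connection streams the real forward pass would carry):

```python
# toy shape walk-through of the per-layer memory splice above (identity in place of NeuralMemory)
import torch
from einops import rearrange

batch, windows, dim = 2, 3, 8
segment_len, num_longterm_mem_tokens = 4, 2
total_segment_len = segment_len + num_longterm_mem_tokens

x = torch.randn(batch * windows, total_segment_len, dim)   # as produced by pad_and_segment_with_inverse

longterm_mems, rest = x[:, :num_longterm_mem_tokens], x[:, num_longterm_mem_tokens:]

longterm_mems = rearrange(longterm_mems, '(b w) n d -> b (w n) d', b = batch)
print(longterm_mems.shape)                                  # torch.Size([2, 6, 8]): what the memory module would see

longterm_mems = rearrange(longterm_mems, 'b (w n) d -> (b w) n d', n = num_longterm_mem_tokens)
x = torch.cat((longterm_mems, rest), dim = -2)
print(x.shape)                                              # torch.Size([6, 6, 8]): windows reassembled
```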
```diff
@@ -245,7 +301,7 @@ class MemoryAsContextTransformer(Module):
 
         x, inverse_segment = pad_and_segment_with_inverse(x, total_segment_len)
 
-        x = x[:, …
+        x = x[:, num_longterm_mem_tokens:]
 
         x = inverse_segment(x)
 
```
```diff
@@ -253,4 +309,9 @@ class MemoryAsContextTransformer(Module):
 
         x = self.norm(x)
 
-        return self.to_logits(x)
+        logits = self.to_logits(x)
+
+        if not return_loss:
+            return logits
+
+        return F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
```
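With this change the forward pass has two modes: called plainly it returns per-position logits, and with `return_loss = True` it shifts the input by one token internally and returns the cross-entropy scalar. A short sketch, reusing the illustrative `model` from the constructor sketch above:

```python
# sketch of the two forward modes added in 0.0.34 (model from the earlier illustrative instantiation)
import torch

ids = torch.randint(0, 256, (1, 128))

logits = model(ids)                       # expected shape: (1, 128, 256)

loss = model(ids, return_loss = True)     # internally uses ids[:, :-1] as input and ids[:, 1:] as labels
loss.backward()
```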
```diff
--- titans_pytorch-0.0.31/train.py
+++ titans_pytorch-0.0.34/train.py
@@ -63,11 +63,8 @@ def decode_tokens(tokens):
 titans_neural_memory = NeuralMemory(
     dim = 384,
     chunk_size = 4,
-    pre_rmsnorm = True,
-    post_rmsnorm = True,
     dim_head = 64,
     heads = 4,
-    max_grad_norm = 1.,
     use_accelerated_scan = True,
     default_mlp_kwargs = dict(
         depth = NEURAL_MEMORY_DEPTH
```
```diff
--- /dev/null
+++ titans_pytorch-0.0.34/train_mac.py
@@ -0,0 +1,129 @@
+import random
+import tqdm
+import gzip
+import numpy as np
+
+import torch
+from torch import nn
+from torch.optim import Adam
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset
+
+from titans_pytorch.mac_transformer import MemoryAsContextTransformer
+
+# constants
+
+NUM_BATCHES = int(1e5)
+BATCH_SIZE = 4
+GRADIENT_ACCUMULATE_EVERY = 4
+LEARNING_RATE = 2e-4
+VALIDATE_EVERY = 100
+GENERATE_EVERY = 500
+GENERATE_LENGTH = 512
+SHOULD_GENERATE = False
+SEQ_LEN = 512
+
+PROJECT_NAME = 'titans-mac-transformer'
+WANDB_ONLINE = False # turn this on to pipe experiment to cloud
+GLOBAL_LAYERS = (2, 4)
+NEURAL_MEMORY_DEPTH = 2
+WINDOW_SIZE = 64
+RUN_NAME = 'mac'
+
+# wandb experiment tracker
+
+import wandb
+wandb.init(project = PROJECT_NAME, mode = 'disabled' if not WANDB_ONLINE else 'online')
+wandb.run.name = RUN_NAME
+wandb.run.save()
+
+# helpers
+
+def cycle(loader):
+    while True:
+        for data in loader:
+            yield data
+
+def decode_token(token):
+    return str(chr(max(32, token)))
+
+def decode_tokens(tokens):
+    return ''.join(list(map(decode_token, tokens)))
+
+# instantiate memory-as-context transformer
+
+model = MemoryAsContextTransformer(
+    num_tokens = 256,
+    dim = 384,
+    depth = 8,
+    segment_len = WINDOW_SIZE,
+    num_persist_mem_tokens = 16,
+    num_longterm_mem_tokens = 16,
+    neural_memory_layers = (3, 4),
+    neural_memory_kwargs = dict(
+        default_mlp_kwargs = dict(
+            depth = NEURAL_MEMORY_DEPTH
+        )
+    )
+).cuda()
+
+# prepare enwik8 data
+
+with gzip.open('./data/enwik8.gz') as file:
+    data = np.frombuffer(file.read(int(95e6)), dtype = np.uint8).copy()
+    data_train, data_val = np.split(data, [int(90e6)])
+    data_train, data_val = map(torch.from_numpy, (data_train, data_val))
+
+class TextSamplerDataset(Dataset):
+    def __init__(self, data, seq_len):
+        super().__init__()
+        self.data = data
+        self.seq_len = seq_len
+
+    def __getitem__(self, index):
+        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
+        return full_seq.cuda()
+
+    def __len__(self):
+        return self.data.size(0) // self.seq_len
+
+train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+train_loader = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
+val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))
+
+# optimizer
+
+optim = Adam(model.parameters(), lr=LEARNING_RATE)
+
+# training
+
+for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
+    model.train()
+
+    for __ in range(GRADIENT_ACCUMULATE_EVERY):
+        loss = model(next(train_loader), return_loss = True)
+        loss.backward()
+
+    print(f'training loss: {loss.item()}')
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optim.step()
+    optim.zero_grad()
+    wandb.log(dict(loss = loss.item()))
+
+    if i % VALIDATE_EVERY == 0:
+        model.eval()
+        with torch.no_grad():
+            loss = model(next(val_loader), return_loss = True)
+            print(f'validation loss: {loss.item()}')
+
+    if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
+        model.eval()
+        inp = random.choice(val_dataset)[:-1]
+        prime = decode_tokens(inp)
+        print(f'%s \n\n %s', (prime, '*' * 100))
+
+        sample = model.generate(inp[None, ...], GENERATE_LENGTH, use_kv_cache = False)
+        output_str = decode_tokens(sample[0])
+        print(output_str)
```