titans-pytorch 0.0.32__tar.gz → 0.0.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/PKG-INFO +1 -1
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/pyproject.toml +1 -1
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/titans_pytorch/__init__.py +2 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/titans_pytorch/mac_transformer.py +25 -4
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/train.py +0 -3
- titans_pytorch-0.0.35/train_mac.py +129 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/.gitignore +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/LICENSE +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/README.md +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/data/README.md +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/data/enwik8.gz +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/fig1.png +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/fig2.png +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/requirements.txt +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/tests/test_titans.py +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/titans_pytorch/titans.py +0 -0
- {titans_pytorch-0.0.32 → titans_pytorch-0.0.35}/titans_pytorch/titans_attn_memory.py +0 -0
titans_pytorch/mac_transformer.py

@@ -50,13 +50,20 @@ def pad_and_segment_with_inverse(seq, segment_len):
     next_seq_len_mult = round_up_multiple(seq_len, segment_len)
 
     padding = next_seq_len_mult - seq_len
-    seq = F.pad(seq, (0, 0, 0, padding))
+    needs_pad = padding > 0
+
+    if needs_pad:
+        seq = F.pad(seq, (0, 0, 0, padding))
 
     seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)
 
     def inverse(out):
         out = rearrange(out, '(b w) n d -> b (w n) d', b = batch)
-        return out[:, :-padding]
+
+        if needs_pad:
+            out = out[:, :-padding]
+
+        return out
 
     return seq, inverse
 
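Note on the hunk above: padding is now applied only when the sequence length is not already a multiple of segment_len, and the inverse only slices when padding was actually added (out[:, :-padding] with padding == 0 would return an empty tensor). Below is a minimal standalone sketch of the patched behavior, not the library code itself; round_up_multiple is reimplemented here for self-containment, and einops/F.pad are used as in the hunk.

import torch
import torch.nn.functional as F
from einops import rearrange

def round_up_multiple(n, mult):
    # smallest multiple of `mult` greater than or equal to n
    return ((n + mult - 1) // mult) * mult

def pad_and_segment_with_inverse(seq, segment_len):
    # sketch of the patched helper: pad only when the length is not already a multiple
    batch, seq_len = seq.shape[:2]
    padding = round_up_multiple(seq_len, segment_len) - seq_len
    needs_pad = padding > 0

    if needs_pad:
        seq = F.pad(seq, (0, 0, 0, padding))  # right-pad the sequence (second-to-last) dimension

    seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)

    def inverse(out):
        out = rearrange(out, '(b w) n d -> b (w n) d', b = batch)
        if needs_pad:
            out = out[:, :-padding]  # out[:, :-0] would be empty, hence the guard
        return out

    return seq, inverse

# a length that is already a multiple of segment_len now round-trips unchanged
x = torch.randn(2, 128, 16)
segments, inverse = pad_and_segment_with_inverse(x, 64)
assert inverse(segments).shape == x.shape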
@@ -226,7 +233,14 @@ class MemoryAsContextTransformer(Module):
 
         self.to_logits = LinearNoBias(dim, num_tokens)
 
-    def forward(self, x):
+    def forward(
+        self,
+        x,
+        return_loss = False
+    ):
+
+        if return_loss:
+            x, labels = x[:, :-1], x[:, 1:]
 
         # math
 
@@ -262,6 +276,7 @@ class MemoryAsContextTransformer(Module):
 
             if exists(maybe_neural_mem):
                 batch_streams = x.shape[0]
+
                 x, inverse_segment = pad_and_segment_with_inverse(x, total_segment_len)
 
                 longterm_mems, x = x[:, :num_longterm_mem_tokens], x[:, num_longterm_mem_tokens:]
@@ -277,6 +292,7 @@ class MemoryAsContextTransformer(Module):
                 x = inverse_segment(x)
 
             x = attn(x)
+
            x = ff(x)
 
         x = self.reduce_streams(x)
@@ -293,4 +309,9 @@ class MemoryAsContextTransformer(Module):
 
         x = self.norm(x)
 
-        return self.to_logits(x)
+        logits = self.to_logits(x)
+
+        if not return_loss:
+            return logits
+
+        return F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
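Taken together, the forward changes above make the transformer directly usable as a language model: called plainly it returns per-token logits, and with return_loss = True it shifts the input by one position and returns a scalar cross-entropy loss. A usage sketch, reusing the constructor arguments from the new train_mac.py later in this diff; shapes assume one 512-token sequence, and running without .cuda() is an assumption of this sketch, not taken from the diff.

import torch
from titans_pytorch.mac_transformer import MemoryAsContextTransformer

# constructor arguments mirror train_mac.py in this diff
model = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 384,
    depth = 8,
    segment_len = 64,
    num_persist_mem_tokens = 16,
    num_longterm_mem_tokens = 16,
    neural_memory_layers = (3, 4),
    neural_memory_kwargs = dict(
        default_mlp_kwargs = dict(
            depth = 2
        )
    )
)

ids = torch.randint(0, 256, (1, 512))

logits = model(ids)                    # (1, 512, 256) unnormalized token logits
loss = model(ids, return_loss = True)  # scalar cross entropy against the shifted targets
loss.backward()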
train.py

@@ -63,11 +63,8 @@ def decode_tokens(tokens):
 titans_neural_memory = NeuralMemory(
     dim = 384,
     chunk_size = 4,
-    pre_rmsnorm = True,
-    post_rmsnorm = True,
     dim_head = 64,
     heads = 4,
-    max_grad_norm = 1.,
     use_accelerated_scan = True,
     default_mlp_kwargs = dict(
         depth = NEURAL_MEMORY_DEPTH
titans_pytorch-0.0.35/train_mac.py (new file)

@@ -0,0 +1,129 @@
+import random
+import tqdm
+import gzip
+import numpy as np
+
+import torch
+from torch import nn
+from torch.optim import Adam
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset
+
+from titans_pytorch.mac_transformer import MemoryAsContextTransformer
+
+# constants
+
+NUM_BATCHES = int(1e5)
+BATCH_SIZE = 4
+GRADIENT_ACCUMULATE_EVERY = 4
+LEARNING_RATE = 2e-4
+VALIDATE_EVERY = 100
+GENERATE_EVERY = 500
+GENERATE_LENGTH = 512
+SHOULD_GENERATE = False
+SEQ_LEN = 512
+
+PROJECT_NAME = 'titans-mac-transformer'
+WANDB_ONLINE = False # turn this on to pipe experiment to cloud
+GLOBAL_LAYERS = (2, 4)
+NEURAL_MEMORY_DEPTH = 2
+WINDOW_SIZE = 64
+RUN_NAME = 'mac'
+
+# wandb experiment tracker
+
+import wandb
+wandb.init(project = PROJECT_NAME, mode = 'disabled' if not WANDB_ONLINE else 'online')
+wandb.run.name = RUN_NAME
+wandb.run.save()
+
+# helpers
+
+def cycle(loader):
+    while True:
+        for data in loader:
+            yield data
+
+def decode_token(token):
+    return str(chr(max(32, token)))
+
+def decode_tokens(tokens):
+    return ''.join(list(map(decode_token, tokens)))
+
+# instantiate memory-as-context transformer
+
+model = MemoryAsContextTransformer(
+    num_tokens = 256,
+    dim = 384,
+    depth = 8,
+    segment_len = WINDOW_SIZE,
+    num_persist_mem_tokens = 16,
+    num_longterm_mem_tokens = 16,
+    neural_memory_layers = (3, 4),
+    neural_memory_kwargs = dict(
+        default_mlp_kwargs = dict(
+            depth = NEURAL_MEMORY_DEPTH
+        )
+    )
+).cuda()
+
+# prepare enwik8 data
+
+with gzip.open('./data/enwik8.gz') as file:
+    data = np.frombuffer(file.read(int(95e6)), dtype = np.uint8).copy()
+    data_train, data_val = np.split(data, [int(90e6)])
+    data_train, data_val = map(torch.from_numpy, (data_train, data_val))
+
+class TextSamplerDataset(Dataset):
+    def __init__(self, data, seq_len):
+        super().__init__()
+        self.data = data
+        self.seq_len = seq_len
+
+    def __getitem__(self, index):
+        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
+        return full_seq.cuda()
+
+    def __len__(self):
+        return self.data.size(0) // self.seq_len
+
+train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+train_loader = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
+val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))
+
+# optimizer
+
+optim = Adam(model.parameters(), lr=LEARNING_RATE)
+
+# training
+
+for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
+    model.train()
+
+    for __ in range(GRADIENT_ACCUMULATE_EVERY):
+        loss = model(next(train_loader), return_loss = True)
+        loss.backward()
+
+    print(f'training loss: {loss.item()}')
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optim.step()
+    optim.zero_grad()
+    wandb.log(dict(loss = loss.item()))
+
+    if i % VALIDATE_EVERY == 0:
+        model.eval()
+        with torch.no_grad():
+            loss = model(next(val_loader), return_loss = True)
+            print(f'validation loss: {loss.item()}')
+
+    if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
+        model.eval()
+        inp = random.choice(val_dataset)[:-1]
+        prime = decode_tokens(inp)
+        print(f'%s \n\n %s', (prime, '*' * 100))
+
+        sample = model.generate(inp[None, ...], GENERATE_LENGTH, use_kv_cache = False)
+        output_str = decode_tokens(sample[0])
+        print(output_str)