titans-pytorch 0.0.6__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/PKG-INFO +1 -1
- titans_pytorch-0.0.7/data/README.md +3 -0
- titans_pytorch-0.0.7/data/enwik8.gz +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/pyproject.toml +1 -1
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/titans_pytorch/titans.py +9 -1
- titans_pytorch-0.0.7/train.py +108 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/.gitignore +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/LICENSE +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/README.md +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/fig1.png +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/fig2.png +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/titans_pytorch/associative_scan.py +0 -0
titans_pytorch-0.0.7/data/enwik8.gz

Binary file
{titans_pytorch-0.0.6 → titans_pytorch-0.0.7}/titans_pytorch/titans.py

@@ -91,10 +91,14 @@ class NeuralMemory(Module):
         dim,
         chunk_size = 1,
         model: Module | None = None,
-        store_memory_loss_fn: Callable = default_loss_fn
+        store_memory_loss_fn: Callable = default_loss_fn,
+        pre_rmsnorm = False
     ):
         super().__init__()

+        self.retrieve_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
+        self.store_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
+
         if not exists(model):
             model = MLP(dim, depth = 4)

@@ -161,6 +165,8 @@ class NeuralMemory(Module):
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]]
     ):

+        seq = self.store_norm(seq)
+
         # curtail sequence by multiple of the chunk size
         # only a complete chunk of the sequence provides the memory for the next chunk

@@ -244,6 +250,8 @@ class NeuralMemory(Module):
         chunk_size = self.chunk_size
         batch, seq_len = seq.shape[:2]

+        seq = self.retrieve_norm(seq)
+
         assert seq_len >= chunk_size

         seq = seq[:, (chunk_size - 1):]
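The functional change to `NeuralMemory` in this release is the optional `pre_rmsnorm` flag: when enabled, the input sequence is passed through `nn.RMSNorm(dim)` before the store and retrieve paths respectively; when left at the default `False`, both norms are `nn.Identity()`. A minimal sketch of constructing the module with the new flag follows; the value of `dim` and the single-tensor forward call are assumptions for illustration (based on the package README's usage pattern) and are not part of this diff. Note that `nn.RMSNorm` requires a recent PyTorch release.

```python
# Minimal sketch (not part of the release): constructing NeuralMemory with the
# new pre_rmsnorm flag added in 0.0.7. dim and the forward call are assumed for
# illustration; only the constructor arguments appear in the diff above.
import torch
from titans_pytorch.titans import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 1,      # default chunk size from the diff
    pre_rmsnorm = True   # new flag: RMSNorm applied before store / retrieve
)

seq = torch.randn(2, 64, 384)

# assumed usage: retrieving memories for a sequence; consult the package
# README for the exact forward signature of this version
retrieved = mem(seq)
```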
titans_pytorch-0.0.7/train.py

@@ -0,0 +1,108 @@
+import random
+import tqdm
+import gzip
+import numpy as np
+
+import torch
+from torch.optim import Adam
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset
+
+from local_attention import LocalTransformer
+
+from titans_pytorch.titans import NeuralMemory
+
+# constants
+
+NUM_BATCHES = int(1e5)
+BATCH_SIZE = 4
+GRADIENT_ACCUMULATE_EVERY = 4
+LEARNING_RATE = 2e-4
+VALIDATE_EVERY = 100
+GENERATE_EVERY = 500
+GENERATE_LENGTH = 512
+SEQ_LEN = 512
+
+# helpers
+
+def cycle(loader):
+    while True:
+        for data in loader:
+            yield data
+
+def decode_token(token):
+    return str(chr(max(32, token)))
+
+def decode_tokens(tokens):
+    return ''.join(list(map(decode_token, tokens)))
+
+# instantiate GPT-like decoder model
+
+model = LocalTransformer(
+    num_tokens = 256,
+    dim = 512,
+    depth = 8,
+    causal = True,
+    local_attn_window_size = 64,
+    max_seq_len = SEQ_LEN
+).cuda()
+
+# prepare enwik8 data
+
+with gzip.open('./data/enwik8.gz') as file:
+    data = np.frombuffer(file.read(int(95e6)), dtype = np.uint8).copy()
+    data_train, data_val = np.split(data, [int(90e6)])
+    data_train, data_val = map(torch.from_numpy, (data_train, data_val))
+
+class TextSamplerDataset(Dataset):
+    def __init__(self, data, seq_len):
+        super().__init__()
+        self.data = data
+        self.seq_len = seq_len
+
+    def __getitem__(self, index):
+        rand_start = torch.randint(0, self.data.size(0) - self.seq_len, (1,))
+        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
+        return full_seq.cuda()
+
+    def __len__(self):
+        return self.data.size(0) // self.seq_len
+
+train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+train_loader = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE))
+val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE))
+
+# optimizer
+
+optim = Adam(model.parameters(), lr=LEARNING_RATE)
+
+# training
+
+for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
+    model.train()
+
+    for __ in range(GRADIENT_ACCUMULATE_EVERY):
+        loss = model(next(train_loader), return_loss = True)
+        loss.backward()
+
+    print(f'training loss: {loss.item()}')
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optim.step()
+    optim.zero_grad()
+
+    if i % VALIDATE_EVERY == 0:
+        model.eval()
+        with torch.no_grad():
+            loss = model(next(val_loader), return_loss = True)
+            print(f'validation loss: {loss.item()}')
+
+    if i % GENERATE_EVERY == 0:
+        model.eval()
+        inp = random.choice(val_dataset)[:-1]
+        prime = decode_tokens(inp)
+        print(f'%s \n\n %s', (prime, '*' * 100))
+
+        sample = model.generate(inp[None, ...], GENERATE_LENGTH, use_kv_cache = False)
+        output_str = decode_tokens(sample[0])
+        print(output_str)
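The new `train.py` sets up a character-level language-modelling run on enwik8 using a `LocalTransformer` from the `local_attention` package; `NeuralMemory` is imported but not yet referenced by the script. One detail worth noting is that `TextSamplerDataset.__getitem__` returns `SEQ_LEN + 1` tokens, the usual convention for forming shifted input/target pairs for next-token prediction when the model is called with `return_loss = True`. Below is a small CPU-only sketch of that sampling logic with a hypothetical toy data tensor, not taken from the release.

```python
# Minimal, CPU-only sketch of the sampling logic used by TextSamplerDataset in
# the new train.py. The toy `data` tensor is hypothetical; the real script
# samples from the enwik8 byte stream and moves tensors to CUDA.
import torch

data = torch.arange(1000, dtype = torch.uint8)   # stand-in for enwik8 bytes
seq_len = 16

rand_start = torch.randint(0, data.size(0) - seq_len, (1,))
full_seq = data[rand_start: rand_start + seq_len + 1].long()

# seq_len + 1 tokens per sample: the extra token lets the model form shifted
# input/target pairs (inputs full_seq[:-1], targets full_seq[1:]) during the
# loss computation
assert full_seq.shape[0] == seq_len + 1
```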
The remaining eight files listed above with +0 -0 are unchanged between 0.0.6 and 0.0.7.