titans-pytorch 0.0.50__tar.gz → 0.0.52__tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.0.50
+ Version: 0.0.52
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -135,3 +135,12 @@ $ python train_mac.py
  year = {2024}
  }
  ```
+
+ ```bibtex
+ @inproceedings{Yang2024GatedDN,
+ title = {Gated Delta Networks: Improving Mamba2 with Delta Rule},
+ author = {Songlin Yang and Jan Kautz and Ali Hatamizadeh},
+ year = {2024},
+ url = {https://api.semanticscholar.org/CorpusID:274598177}
+ }
+ ```
@@ -82,3 +82,12 @@ $ python train_mac.py
  year = {2024}
  }
  ```
+
+ ```bibtex
+ @inproceedings{Yang2024GatedDN,
+ title = {Gated Delta Networks: Improving Mamba2 with Delta Rule},
+ author = {Songlin Yang and Jan Kautz and Ali Hatamizadeh},
+ year = {2024},
+ url = {https://api.semanticscholar.org/CorpusID:274598177}
+ }
+ ```
@@ -1,6 +1,6 @@
  [project]
  name = "titans-pytorch"
- version = "0.0.50"
+ version = "0.0.52"
  description = "Titans"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -1,16 +1,21 @@
  import torch
+ from torch import nn
+
  import pytest
  from titans_pytorch import NeuralMemory

  @pytest.mark.parametrize('seq_len', (32, 1024, 77))
+ @pytest.mark.parametrize('silu', (False, True))
  @pytest.mark.parametrize('max_grad_norm', (None, 2.))
  def test_titans(
      seq_len,
-     max_grad_norm
+     silu,
+     max_grad_norm,
  ):
      mem = NeuralMemory(
          dim = 384,
          chunk_size = 64,
+         activation = nn.SiLU() if silu else None,
          max_grad_norm = max_grad_norm
      )

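The new `silu` flag in the test exercises the `activation` argument that 0.0.52 adds to `NeuralMemory` (wired into the query and key/value projections further down in this diff). A minimal usage sketch mirroring the test; the shapes are illustrative and the round-trip `retrieved = mem(seq)` call follows the project README rather than anything shown in this diff:

```python
import torch
from torch import nn
from titans_pytorch import NeuralMemory

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,
    activation = nn.SiLU()   # new in 0.0.52; None keeps the plain linear projections
)

seq = torch.randn(2, 1024, 384)
retrieved = mem(seq)         # retrieve from the neural memory

assert retrieved.shape == seq.shape
```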
@@ -7,16 +7,48 @@ from torch import nn, cat
  import torch.nn.functional as F
  from torch.nn import Module, ModuleList, Linear

+ # flex attention
+ # https://pytorch.org/blog/flexattention/
+
+ flex_attention = None
+
+ try:
+     from torch.nn.attention.flex_attention import flex_attention, create_block_mask
+     if torch.cuda.is_available():
+         flex_attention = torch.compile(flex_attention)
+ except ImportError:
+     pass
+
+ def create_mac_block_mask(seq_len, window_size, persist_mem_len):
+
+     def create_mac_mask(b, h, q_idx, kv_idx):
+         is_persist_mem = kv_idx < persist_mem_len
+         causal_mask = q_idx >= (kv_idx - is_persist_mem)
+         block_diagonal = (q_idx // window_size) == ((kv_idx - is_persist_mem) // window_size)
+         return is_persist_mem | (~is_persist_mem & (causal_mask & block_diagonal))
+
+     block_mask = create_block_mask(create_mac_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len + persist_mem_len, _compile = True)
+     return block_mask
+
+ # einstein notation related
+
  from einops import einsum, repeat, rearrange, pack, unpack
  from einops.layers.torch import Rearrange

- from hyper_connections import get_init_and_expand_reduce_stream_functions
+ # b - batch
+ # n - sequence
+ # h - heads
+ # d - feature dimension

  # absolute and relative positions

  from axial_positional_embedding import ContinuousAxialPositionalEmbedding
  from rotary_embedding_torch import RotaryEmbedding
+
+ # hyper connections / attend from x-transformers, which handles different queries and key lengths better
+
  from x_transformers.attend import Attend
+ from hyper_connections import get_init_and_expand_reduce_stream_functions

  # proposed neural memory

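The heart of the new flex attention support is `create_mac_block_mask`: every query may attend to the persistent-memory prefix of the keys/values, and otherwise attends causally only within its own window. As a reading aid, here is a hypothetical dense boolean-mask sketch of that intended pattern (not package code; note it offsets `kv_idx` by the full `persist_mem_len`, where the mask mod above subtracts the boolean flag):

```python
import torch

def dense_mac_mask(seq_len, window_size, persist_mem_len):
    # queries index the sequence, keys/values index persistent memory + sequence
    q_idx = torch.arange(seq_len)[:, None]
    kv_idx = torch.arange(seq_len + persist_mem_len)[None, :]

    is_persist_mem = kv_idx < persist_mem_len
    seq_kv_idx = kv_idx - persist_mem_len                       # kv position within the sequence proper

    causal = q_idx >= seq_kv_idx
    block_diagonal = (q_idx // window_size) == (seq_kv_idx // window_size)

    # persistent memory is always visible, the rest is causal within each window
    return is_persist_mem | (~is_persist_mem & causal & block_diagonal)

print(dense_mac_mask(seq_len = 8, window_size = 4, persist_mem_len = 2).int())
```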
@@ -96,6 +128,7 @@ class SegmentedAttention(Module):
          heads = 8,
          accept_value_residual = False,
          attend_kwargs: dict = dict(),
+         use_flex_attn = False
      ):
          super().__init__()
          self.norm = nn.RMSNorm(dim)
@@ -125,11 +158,79 @@ class SegmentedAttention(Module):

          self.persistent_memory = nn.Parameter(torch.zeros(2, heads, num_persist_mem_tokens, dim_head))

+         # flex attn related
+
+         assert not (use_flex_attn and not exists(flex_attention)), 'you need to be on the latest pytorch with a cuda device available'
+         self.use_flex_attn = use_flex_attn
+
+         self.segment_len = segment_len
+         self.num_persist_mem_tokens = num_persist_mem_tokens
+
+     def forward_flex(
+         self,
+         seq,
+         value_residual = None,
+         flex_attn_fn: Callable | None = None
+     ):
+
+         assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
+
+         batch, seq_len = seq.shape[:2]
+
+         # attention
+
+         seq = self.norm(seq)
+
+         q, k, v = self.to_qkv(seq).chunk(3, dim = -1)
+         q, k, v = map(self.split_heads, (q, k, v))
+
+         # value residual
+
+         orig_v = v
+
+         if exists(self.to_learned_v_mix):
+             mix = self.to_learned_v_mix(seq)
+             v = v.lerp(value_residual, mix)
+
+         # take care of persistent memory key / values
+
+         pmk, pmv = repeat(self.persistent_memory, 'kv h n d -> kv b h n d', b = batch)
+
+         # relative positions
+
+         q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+         # persistent memory
+
+         k = cat((pmk, k), dim = -2)
+         v = cat((pmv, v), dim = -2)
+
+         # prep flex attention
+
+         if not exists(flex_attn_fn):
+             block_mask = create_mac_block_mask(seq_len, self.segment_len, self.num_persist_mem_tokens)
+
+             flex_attn_fn = partial(flex_attention, block_mask = block_mask)
+
+         # attention
+
+         out = flex_attn_fn(q, k, v)
+
+         out = self.merge_heads(out)
+
+         out = self.to_out(out)
+
+         return out, orig_v
+
      def forward(
          self,
          seq,
-         value_residual = None
+         value_residual = None,
+         flex_attn_fn: Callable | None = None
      ):
+         if seq.is_cuda and self.use_flex_attn:
+             return self.forward_flex(seq, value_residual, flex_attn_fn)
+
          assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))

          segment_len, num_longterm_mem_tokens = self.segment_len, self.num_longterm_mem_tokens
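Two steps of the new `forward_flex` are easy to check in isolation on CPU: blending in the value residual from the first attention layer with a learned per-token mix, and prepending persistent-memory keys/values so they are visible to every query. A self-contained sketch with assumed shapes (not the package defaults); `mix` stands in for the output of `to_learned_v_mix`:

```python
import torch
from einops import repeat

batch, heads, seq_len, dim_head, num_persist = 2, 8, 64, 64, 4

v = torch.randn(batch, heads, seq_len, dim_head)
value_residual = torch.randn_like(v)           # values carried over from the first layer
mix = torch.rand(batch, heads, seq_len, 1)     # stands in for the learned per-token mix

v = v.lerp(value_residual, mix)                # v * (1 - mix) + value_residual * mix

persistent_memory = torch.zeros(2, heads, num_persist, dim_head)
pmk, pmv = repeat(persistent_memory, 'kv h n d -> kv b h n d', b = batch)

k = torch.randn(batch, heads, seq_len, dim_head)
k = torch.cat((pmk, k), dim = -2)              # persistent memory keys sit in front of the sequence
v = torch.cat((pmv, v), dim = -2)

assert k.shape == v.shape == (batch, heads, seq_len + num_persist, dim_head)
```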
@@ -159,7 +260,7 @@ class SegmentedAttention(Module):

          # take care of persistent memory key / values

-         pmk, pmv = tuple(repeat(t, 'h n d -> b h n d', b = seq.shape[0]) for t in self.persistent_memory)
+         pmk, pmv = repeat(self.persistent_memory, 'kv ... -> kv b ...', b = seq.shape[0])

          # relative positions

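This hunk replaces a per-tensor loop with a single einops `repeat` over the stacked persistent-memory parameter. A quick sanity check with hypothetical shapes, showing the two forms produce identical tensors:

```python
import torch
from einops import repeat

persistent_memory = torch.randn(2, 8, 4, 64)   # (kv, heads, num persist tokens, dim head)
batch = 3

# 0.0.50: repeat each of the two stacked tensors separately
old_pmk, old_pmv = tuple(repeat(t, 'h n d -> b h n d', b = batch) for t in persistent_memory)

# 0.0.52: one repeat over the leading kv dimension
new_pmk, new_pmv = repeat(persistent_memory, 'kv ... -> kv b ...', b = batch)

assert torch.equal(old_pmk, new_pmk) and torch.equal(old_pmv, new_pmv)
```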
@@ -56,6 +56,17 @@ def pack_one_with_inverse(t, pattern):

      return packed, inverse

+ def Sequential(*modules):
+     modules = [*filter(exists, modules)]
+
+     if len(modules) == 0:
+         return nn.Identity()
+
+     if len(modules) == 1:
+         return modules[0]
+
+     return nn.Sequential(*modules)
+
  # softclamping gradients

  def softclamp_max(t, max_value):
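The new `Sequential` helper filters out `None` modules, which is what lets the optional `activation` be appended to the query and key/value projections later in this diff without special-casing. A small self-contained sketch of the same helper (the `exists` utility is assumed, as in the package):

```python
import torch
from torch import nn

def exists(v):
    return v is not None

def Sequential(*modules):
    modules = [*filter(exists, modules)]

    if len(modules) == 0:
        return nn.Identity()

    if len(modules) == 1:
        return modules[0]

    return nn.Sequential(*modules)

proj = Sequential(nn.Linear(384, 768, bias = False), None)            # None is dropped: plain Linear
proj_act = Sequential(nn.Linear(384, 768, bias = False), nn.SiLU())   # Linear followed by SiLU

x = torch.randn(2, 64, 384)
assert proj(x).shape == proj_act(x).shape == (2, 64, 768)
```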
@@ -124,9 +135,6 @@ class MemoryAttention(Module):
          ])

      def forward(self, x):
-
-         assert x.shape[-2] > 1, 'chunk size needs to be greater than 1 for using attention as memory'
-
          wq, wk, wv, ffw1, ffw2 = self.weights

          q = F.normalize(x @ wq, dim = -1)
@@ -168,6 +176,7 @@ class NeuralMemory(Module):
          post_rmsnorm = True,
          max_grad_norm: float | None = None,
          use_accelerated_scan = False,
+         activation: Module | None = None,
          default_model_kwargs: dict = dict(
              depth = 2
          )
@@ -225,11 +234,11 @@ class NeuralMemory(Module):

          # queries for retrieving from the model

-         self.to_queries = LinearNoBias(dim, dim_inner)
+         self.to_queries = Sequential(LinearNoBias(dim, dim_inner), activation)

          # keys and values for storing to the model

-         self.to_keys_values = LinearNoBias(dim, dim_inner * 2)
+         self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
          self.store_memory_loss_fn = store_memory_loss_fn

          # empty memory embed
@@ -4,7 +4,7 @@ import gzip
  import numpy as np

  import torch
- from torch import nn
+ from torch import nn, Tensor
  from torch.optim import Adam
  from torch.nn import functional as F
  from torch.utils.data import DataLoader, Dataset
@@ -19,12 +19,13 @@ GRADIENT_ACCUMULATE_EVERY = 4
  LEARNING_RATE = 2e-4
  VALIDATE_EVERY = 100
  GENERATE_EVERY = 500
+ PRIME_LENGTH = 100
  GENERATE_LENGTH = 512
- SHOULD_GENERATE = False
+ SHOULD_GENERATE = True
  SEQ_LEN = 512

  PROJECT_NAME = 'titans-mac-transformer'
- WANDB_ONLINE = True # turn this on to pipe experiment to cloud
+ WANDB_ONLINE = False # turn this on to pipe experiment to cloud
  NEURAL_MEMORY_DEPTH = 2
  NUM_PERSIST_MEM = 4
  NUM_LONGTERM_MEM = 4
@@ -53,6 +54,52 @@ def decode_token(token):
  def decode_tokens(tokens):
      return ''.join(list(map(decode_token, tokens)))

+ # sampling helpers
+
+ def log(t, eps = 1e-20):
+     return torch.log(t.clamp(min = eps))
+
+ def gumbel_noise(t):
+     noise = torch.zeros_like(t).uniform_(0, 1)
+     return -log(-log(noise))
+
+ def gumbel_sample(t, temperature = 1., keepdim = True):
+     if temperature <= 0.:
+         return t.argmax(dim = -1, keepdim = keepdim)
+
+     return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim = -1, keepdim = keepdim)
+
+ # min_p
+ # https://arxiv.org/abs/2407.01082
+
+ def min_p_filter(logits, min_p = 0.1):
+     probs = logits.softmax(dim = -1)
+     max_probs = probs.amax(dim = -1, keepdim = True)
+     limit = min_p * max_probs
+     return torch.where(probs < limit, float('-inf'), logits)
+
+ def base_decoding(
+     net,
+     prompt: Tensor,
+     seq_len: int,
+     temperature = 1.5,
+     min_p = 1e-1,
+     filter_thres = 0.9,
+ ):
+     prompt_seq_len, out = prompt.shape[-1], prompt.clone()
+     sample_num_times = max(0, seq_len - prompt_seq_len)
+
+     for _ in tqdm.tqdm(range(sample_num_times)):
+         logits = net(out)
+         logits = logits[:, -1]
+
+         logits = min_p_filter(logits, min_p = min_p)
+         sample = gumbel_sample(logits, temperature = temperature)
+
+         out = torch.cat((out, sample), dim = -1)
+
+     return out[..., prompt_seq_len:]
+
  # instantiate memory-as-context transformer
  model = MemoryAsContextTransformer(
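The sampling helpers added to the training script implement min_p filtering (keep only tokens whose probability is at least `min_p` times that of the most likely token) followed by Gumbel-max sampling with temperature. A self-contained sketch of the same two steps on toy logits (values are made up for illustration):

```python
import torch

logits = torch.tensor([[2.0, 1.9, 0.1, -3.0]])

# min_p filter: drop tokens whose probability is below min_p * (top probability)
probs = logits.softmax(dim = -1)
limit = 0.1 * probs.amax(dim = -1, keepdim = True)
filtered = torch.where(probs < limit, float('-inf'), logits)

# gumbel-max sampling with temperature: perturb the scaled logits, take the argmax
noise = torch.rand_like(filtered).clamp(min = 1e-20)
gumbel = -torch.log(-torch.log(noise))
sample = ((filtered / 1.5) + gumbel).argmax(dim = -1, keepdim = True)

print(filtered)   # only the lowest-probability token is masked to -inf
print(sample)     # shape (1, 1), ready to be concatenated onto the running sequence
```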
@@ -127,10 +174,10 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):

      if SHOULD_GENERATE and i % GENERATE_EVERY == 0:
          model.eval()
-         inp = random.choice(val_dataset)[:-1]
+         inp = random.choice(val_dataset)[:PRIME_LENGTH]
          prime = decode_tokens(inp)
          print(f'%s \n\n %s', (prime, '*' * 100))

-         sample = model.generate(inp[None, ...], GENERATE_LENGTH, use_kv_cache = False)
+         sample = base_decoding(model, inp[None, ...], GENERATE_LENGTH)
          output_str = decode_tokens(sample[0])
          print(output_str)