titans-pytorch 0.1.18__tar.gz → 0.1.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/PKG-INFO +1 -1
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/pyproject.toml +1 -1
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/tests/test_titans.py +6 -2
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/titans_pytorch/mac_transformer.py +122 -12
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/.github/workflows/python-publish.yml +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/.github/workflows/test.yaml +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/.gitignore +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/LICENSE +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/README.md +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/data/README.md +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/data/enwik8.gz +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/fig1.png +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/fig2.png +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/titans_pytorch/__init__.py +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/titans_pytorch/associative_scan.py +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/titans_pytorch/titans.py +0 -0
- {titans_pytorch-0.1.18 → titans_pytorch-0.1.20}/train_mac.py +0 -0
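Taken together, the changes below add key/value-cache support for incremental decoding: SegmentedAttention gains a dedicated forward_inference path, every attention forward variant now returns an AttnIntermediates namedtuple carrying both the value residual and the cached keys/values, and MemoryAsContextTransformer threads per-layer caches through its forward pass, trimming them to the local attention window before returning.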
tests/test_titans.py

@@ -107,14 +107,18 @@ def test_mac(
     logits = transformer(x)
     assert logits.shape == (1, seq_len, 256)
 
-def test_mac_sampling():
+@pytest.mark.parametrize('sliding', (False, True))
+def test_mac_sampling(sliding):
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
         dim = 256,
         depth = 2,
         segment_len = 32,
         num_persist_mem_tokens = 4,
-        num_longterm_mem_tokens =
+        num_longterm_mem_tokens = 0,
+        sliding_window_attn = sliding,
+        neural_memory_layers = (),
+        neural_mem_gate_attn_output = False
     )
 
     ids = torch.randint(0, 256, (1, 1023))
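The parametrization runs the sampling test once per local-attention mode. A minimal sketch of the two cases pytest collects from the decorator (manual calls shown for illustration only):

    # equivalent to the two parametrized test ids pytest generates
    test_mac_sampling(sliding = False)   # fixed segmented attention
    test_mac_sampling(sliding = True)    # sliding window attention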
titans_pytorch/mac_transformer.py

@@ -1,12 +1,14 @@
 from __future__ import annotations
 from typing import Callable
+
 from math import ceil
 from functools import partial
+from collections import namedtuple
 
 import tqdm
 
 import torch
-from torch import nn, cat
+from torch import nn, stack, cat
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList, Linear
 
@@ -69,6 +71,8 @@ from titans_pytorch.titans import NeuralMemory
 
 LinearNoBias = partial(Linear, bias = False)
 
+AttnIntermediates = namedtuple('AttnIntermediates', ('value_residual', 'cached_key_values'))
+
 # helpers
 
 def exists(v):
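AttnIntermediates replaces the bare value-residual return value later in this diff (return out, orig_v becomes return out, AttnIntermediates(orig_v, next_cache)). Since namedtuples destructure positionally, callers can unpack it inline, as the transformer block loop does further down. A minimal sketch with hypothetical tensors:

    # hypothetical orig_v, k, v - illustrating the namedtuple only
    intermediates = AttnIntermediates(value_residual = orig_v, cached_key_values = (k, v))

    values, next_kv_cache = intermediates             # positional unpacking, as in x, (values, next_kv_cache) = attn(...)
    assert intermediates.value_residual is values     # fields are also accessible by name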
@@ -80,6 +84,9 @@ def default(v, d):
 def identity(t):
     return t
 
+def divisible_by(num, den):
+    return (num % den) == 0
+
 def round_up_multiple(seq, mult):
     return ceil(seq / mult) * mult
 
@@ -111,7 +118,7 @@ def pad_and_segment_with_inverse(seq, segment_len, fold_into_batch = True):
 
     def inverse(out):
         if fold_into_batch:
-            out = rearrange(out, '(b w) n d -> b (w n) d', b = batch)
+            out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)
 
         if needs_pad:
             out = out[..., :-padding, :]
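The added ellipsis generalizes inverse to tensors with extra dimensions between the folded batch and the sequence, which the new caching path relies on when per-head keys and values are passed through inverse_segment. A small sketch of the generalized pattern (shapes illustrative only):

    import torch
    from einops import rearrange

    batch, windows, n, d = 2, 4, 32, 64

    # old behavior, no extra dims: (8, 32, 64) -> (2, 128, 64)
    out = torch.randn(batch * windows, n, d)
    out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)

    # new: a heads dimension rides along untouched: (8, 8, 32, 64) -> (2, 8, 128, 64)
    out = torch.randn(batch * windows, 8, n, d)
    out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)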
@@ -213,12 +220,75 @@ class SegmentedAttention(Module):
         self.segment_len = segment_len
         self.num_persist_mem_tokens = num_persist_mem_tokens
 
+    def forward_inference(
+        self,
+        token,
+        cache,
+        value_residual = None,
+        output_gating = None,
+    ):
+        batch = token.shape[0]
+
+        # attention
+
+        token = self.norm(token)
+
+        q, k, v = self.to_qkv(token).chunk(3, dim = -1)
+        q, k, v = map(self.split_heads, (q, k, v))
+
+        # value residual
+
+        orig_v = v
+
+        if exists(self.to_learned_v_mix):
+            mix = self.to_learned_v_mix(token)
+            v = v.lerp(value_residual, mix)
+
+        # caching
+
+        ck, cv = cache
+        k = cat((ck, k), dim = -2)
+        v = cat((cv, v), dim = -2)
+
+        next_cache = (k, v)
+
+        # relative positions
+
+        q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+        # fold
+
+        q, k, v = tuple(rearrange(t, 'b h n d -> b h n d') for t in (q, k, v))
+
+        # take care of persistent memory key / values
+
+        pmk, pmv = repeat(self.persistent_memory, 'kv ... -> kv b ...', b = k.shape[0])
+
+        # persistent memory
+
+        k = cat((pmk, k), dim = -2)
+        v = cat((pmv, v), dim = -2)
+
+        # attention
+
+        out, _ = self.attend(q, k, v)
+
+        out = self.merge_heads(out)
+
+        out = self.to_out(out)
+
+        if exists(output_gating):
+            out = out * output_gating
+
+        return out, AttnIntermediates(orig_v, next_cache)
+
     def forward_flex(
         self,
         seq,
         value_residual = None,
         flex_attn_fn: Callable | None = None,
-        output_gating = None
+        output_gating = None,
+        cache = None
     ):
 
         assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
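forward_inference handles exactly one new token: the fresh key/value pair is appended to the running cache before rotary positions and persistent memory are applied, and the extended (k, v) is handed back so the caller can pass it in on the next step. A sketch of one decode step, assuming attn is a SegmentedAttention with 8 heads of dimension 64 and no learned value mixing:

    token = x[:, -1:]                          # (batch, 1, dim) - a single new token
    ck = cv = torch.zeros(1, 8, 0, 64)         # an empty cache works too; cat along dim -2 is a no-op

    out, (values, (k, v)) = attn(token, cache = (ck, cv))

    # k and v are one timestep longer than ck, cv - they become the cache for the next token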
@@ -240,6 +310,10 @@ class SegmentedAttention(Module):
             mix = self.to_learned_v_mix(seq)
             v = v.lerp(value_residual, mix)
 
+        # caching
+
+        next_cache = tuple(map(inverse_segment, (k, v)))
+
         # take care of persistent memory key / values
 
         pmk, pmv = repeat(self.persistent_memory, 'kv h n d -> kv b h n d', b = batch)
@@ -271,7 +345,7 @@ class SegmentedAttention(Module):
         if exists(output_gating):
             out = out * output_gating
 
-        return out, orig_v
+        return out, AttnIntermediates(orig_v, next_cache)
 
     def forward(
         self,
@@ -279,10 +353,17 @@ class SegmentedAttention(Module):
         value_residual = None,
         flex_attn_fn: Callable | None = None,
         disable_flex_attn = False,
-        output_gating = None
+        output_gating = None,
+        cache = None
     ):
+        is_inferencing = exists(cache)
+
+        if is_inferencing:
+            assert seq.shape[-2] == 1
+            return self.forward_inference(seq, cache, value_residual, output_gating = output_gating)
+
         if seq.is_cuda and self.use_flex_attn and not disable_flex_attn:
-            return self.forward_flex(seq, value_residual, flex_attn_fn, output_gating = output_gating)
+            return self.forward_flex(seq, value_residual, flex_attn_fn, output_gating = output_gating, cache = cache)
 
         assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
 
@@ -310,6 +391,10 @@ class SegmentedAttention(Module):
             mix = self.to_learned_v_mix(seq)
             v = v.lerp(value_residual, mix)
 
+        # caching
+
+        next_cache = tuple(map(inverse_segment, (k, v)))
+
         # relative positions
 
         q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
@@ -369,7 +454,7 @@ class SegmentedAttention(Module):
         if exists(output_gating):
             out = out * output_gating
 
-        return out, orig_v
+        return out, AttnIntermediates(orig_v, next_cache)
 
 # MAC transformer
 
@@ -413,6 +498,7 @@ class MemoryAsContextTransformer(Module):
         # maybe sliding window attn
 
         self.sliding_window_attn = sliding_window_attn
+        self.attn_window_size = segment_len + num_longterm_mem_tokens
 
         # hyper conection
 
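For instance, with segment_len = 32 and num_longterm_mem_tokens = 0 (as in the updated test above) the attention window is 32 tokens, while 4 interspersed long-term memory tokens would widen it to 36. This window size is what bounds the per-layer kv cache kept at the end of the forward pass.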
@@ -487,7 +573,6 @@ class MemoryAsContextTransformer(Module):
         assert not (use_flex_attn and not exists(flex_attention)), 'you need to be on the latest pytorch with a cuda device available'
         self.use_flex_attn = use_flex_attn
 
-        self.segment_len = segment_len
         self.num_persist_mem_tokens = num_persist_mem_tokens
 
     @torch.no_grad()
@@ -569,7 +654,7 @@ class MemoryAsContextTransformer(Module):
 
         # math
 
-        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens
+        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size
 
         # token embedding
 
@@ -603,6 +688,12 @@ class MemoryAsContextTransformer(Module):
             block_mask = create_mac_block_mask(seq_len_with_mem, segment_len + num_longterm_mem_tokens, self.num_persist_mem_tokens, self.sliding_window_attn)
             flex_attn_fn = partial(flex_attention, block_mask = block_mask)
 
+        # kv caching
+
+        is_inferencing = exists(cache)
+        cache = iter(default(cache, []))
+        next_kv_caches = []
+
         # value residual
 
         value_residual = None
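Turning the incoming cache into an iterator lets each attention layer pull its own entry with next(cache, None) inside the block loop, and the empty-list default means every layer sees None on a cache-less first pass. The pattern in isolation, with hypothetical per-layer entries:

    layer_caches = iter([cache_layer_1, cache_layer_2])

    next(layer_caches, None)   # cache for layer 1
    next(layer_caches, None)   # cache for layer 2
    next(layer_caches, None)   # None - iterator exhausted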
@@ -611,6 +702,11 @@ class MemoryAsContextTransformer(Module):
 
         kv_recon_losses = self.zero
 
+        # when inferencing, only do one token at a time
+
+        if is_inferencing:
+            x = x[:, -1:]
+
         # expand and reduce streams for hyper connections
 
         x = self.expand_streams(x)
@@ -620,6 +716,8 @@ class MemoryAsContextTransformer(Module):
             retrieved = None
             attn_out_gates = None
 
+            # maybe neural memory
+
             if exists(mem):
                 retrieved, mem_kv_aux_loss = mem(x, return_aux_kv_loss = True)
                 kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
|
|
|
631
729
|
|
|
632
730
|
# attention
|
|
633
731
|
|
|
634
|
-
x, values = attn(
|
|
732
|
+
x, (values, next_kv_cache) = attn(
|
|
635
733
|
x,
|
|
636
734
|
value_residual = value_residual,
|
|
637
735
|
disable_flex_attn = disable_flex_attn,
|
|
638
736
|
flex_attn_fn = flex_attn_fn,
|
|
639
|
-
output_gating = attn_out_gates
|
|
737
|
+
output_gating = attn_out_gates,
|
|
738
|
+
cache = next(cache, None)
|
|
640
739
|
)
|
|
641
740
|
|
|
642
741
|
value_residual = default(value_residual, values)
|
|
643
742
|
|
|
743
|
+
next_kv_caches.append(next_kv_cache)
|
|
744
|
+
|
|
644
745
|
# feedforward
|
|
645
746
|
|
|
646
747
|
x = ff(x)
|
|
@@ -665,7 +766,16 @@ class MemoryAsContextTransformer(Module):
         if not return_cache:
             return logits
 
-
+        next_kv_caches = stack([stack(kv_cache) for kv_cache in next_kv_caches])
+
+        # handle kv cache length depending on local attention type
+
+        next_kv_caches = next_kv_caches[..., -attn_window_size:, :]
+
+        if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
+            next_kv_caches = next_kv_caches[..., 0:0, :]
+
+        return logits, next_kv_caches
 
         ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
 
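The trimming above keeps at most one attention window of keys/values per layer, and for fixed (non-sliding) segmented attention the cache is emptied whenever the sequence length lands exactly on a window boundary, since the next token then opens a fresh segment with nothing local to attend to. A hedged sketch of the greedy decoding loop these pieces enable, assuming a constructed transformer and a prompt of token ids (argument names taken from this diff):

    import torch

    ids = prompt                     # (batch, prompt_len) token ids
    cache = None

    for _ in range(64):
        logits, cache = transformer(ids, cache = cache, return_cache = True, disable_flex_attn = True)
        next_token = logits[:, -1].argmax(dim = -1, keepdim = True)
        ids = torch.cat((ids, next_token), dim = -1)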