titans-pytorch 0.0.51__tar.gz → 0.0.53__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.51
+Version: 0.0.53
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -37,7 +37,6 @@ Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
 Requires-Dist: axial-positional-embedding>=0.3.5
 Requires-Dist: einops>=0.8.0
-Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.8
 Requires-Dist: ninja
 Requires-Dist: rotary-embedding-torch
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.0.51"
+version = "0.0.53"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -27,7 +27,6 @@ classifiers=[
 dependencies = [
     "accelerated-scan>=0.2.0",
     "axial_positional_embedding>=0.3.5",
-    "einx>=0.3.0",
     "einops>=0.8.0",
     "hyper-connections>=0.1.8",
     "Ninja",
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import Callable
 from math import ceil
 from functools import partial
 
@@ -32,7 +33,7 @@ def create_mac_block_mask(seq_len, window_size, persist_mem_len):
 
 # einstein notation related
 
-from einops import einsum, repeat, rearrange, pack, unpack
+from einops import repeat, rearrange, pack, unpack
 from einops.layers.torch import Rearrange
 
 # b - batch
@@ -128,6 +129,7 @@ class SegmentedAttention(Module):
         heads = 8,
         accept_value_residual = False,
         attend_kwargs: dict = dict(),
+        use_flex_attn = False
     ):
         super().__init__()
         self.norm = nn.RMSNorm(dim)
@@ -157,11 +159,79 @@ class SegmentedAttention(Module):
 
         self.persistent_memory = nn.Parameter(torch.zeros(2, heads, num_persist_mem_tokens, dim_head))
 
+        # flex attn related
+
+        assert not (use_flex_attn and not exists(flex_attention)), 'you need to be on the latest pytorch with a cuda device available'
+        self.use_flex_attn = use_flex_attn
+
+        self.segment_len = segment_len
+        self.num_persist_mem_tokens = num_persist_mem_tokens
+
+    def forward_flex(
+        self,
+        seq,
+        value_residual = None,
+        flex_attn_fn: Callable | None = None
+    ):
+
+        assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
+
+        batch, seq_len = seq.shape[:2]
+
+        # attention
+
+        seq = self.norm(seq)
+
+        q, k, v = self.to_qkv(seq).chunk(3, dim = -1)
+        q, k, v = map(self.split_heads, (q, k, v))
+
+        # value residual
+
+        orig_v = v
+
+        if exists(self.to_learned_v_mix):
+            mix = self.to_learned_v_mix(seq)
+            v = v.lerp(value_residual, mix)
+
+        # take care of persistent memory key / values
+
+        pmk, pmv = repeat(self.persistent_memory, 'kv h n d -> kv b h n d', b = batch)
+
+        # relative positions
+
+        q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+        # persistent memory
+
+        k = cat((pmk, k), dim = -2)
+        v = cat((pmv, v), dim = -2)
+
+        # prep flex attention
+
+        if not exists(flex_attn_fn):
+            block_mask = create_mac_block_mask(seq_len, self.segment_len, self.num_persist_mem_tokens)
+
+            flex_attn_fn = partial(flex_attention, block_mask = block_mask)
+
+        # attention
+
+        out = flex_attn_fn(q, k, v)
+
+        out = self.merge_heads(out)
+
+        out = self.to_out(out)
+
+        return out, orig_v
+
     def forward(
         self,
         seq,
-        value_residual = None
+        value_residual = None,
+        flex_attn_fn: Callable | None = None
     ):
+        if seq.is_cuda and self.use_flex_attn:
+            return self.forward_flex(seq, value_residual, flex_attn_fn)
+
         assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
 
         segment_len, num_longterm_mem_tokens = self.segment_len, self.num_longterm_mem_tokens
@@ -191,7 +261,7 @@ class SegmentedAttention(Module):
 
         # take care of persistent memory key / values
 
-        pmk, pmv = tuple(repeat(t, 'h n d -> b h n d', b = seq.shape[0]) for t in self.persistent_memory)
+        pmk, pmv = repeat(self.persistent_memory, 'kv ... -> kv b ...', b = seq.shape[0])
 
         # relative positions
 
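Note on the hunks above: they add an optional flex-attention path to SegmentedAttention. When use_flex_attn is set and the input is on CUDA, forward dispatches to the new forward_flex, which prepends the persistent-memory key/values and runs PyTorch flex attention under a block mask from create_mac_block_mask, building the attention callable as a partial over flex_attention when none is passed in. The sketch below only illustrates that pattern; the mask rule (causal attention within a fixed window, plus unrestricted access to the persistent-memory prefix) and all shapes are assumptions inferred from the diff, not the package's actual create_mac_block_mask.

# Hedged sketch of the flex-attention pattern used by forward_flex above.
# The mask rule and the shapes are assumptions, not code from titans-pytorch.

from functools import partial

import torch
from torch.nn.attention.flex_attention import flex_attention, create_block_mask

def make_mac_mask_mod(window_size, persist_mem_len):
    # kv positions [0, persist_mem_len) are the persistent memory prefix and are
    # always attendable; remaining kv positions map back to sequence positions
    # and must be causal and inside the same window as the query.
    def mask_mod(b, h, q_idx, kv_idx):
        is_persist = kv_idx < persist_mem_len
        kv_seq_idx = kv_idx - persist_mem_len
        same_window = (q_idx // window_size) == (kv_seq_idx // window_size)
        causal = kv_seq_idx <= q_idx
        return is_persist | (same_window & causal)
    return mask_mod

# toy dimensions (assumed); flex attention needs a CUDA device, matching the assert in the diff
batch, heads, seq_len, dim_head = 2, 8, 256, 64
window_size, persist_mem_len = 64, 4

q = torch.randn(batch, heads, seq_len, dim_head, device = 'cuda')
k = torch.randn(batch, heads, seq_len + persist_mem_len, dim_head, device = 'cuda')
v = torch.randn(batch, heads, seq_len + persist_mem_len, dim_head, device = 'cuda')

block_mask = create_block_mask(
    make_mac_mask_mod(window_size, persist_mem_len),
    B = None, H = None,
    Q_LEN = seq_len,
    KV_LEN = seq_len + persist_mem_len,
    device = 'cuda'
)

# mirrors forward_flex: carry the block mask in a partial, then attend
flex_attn_fn = partial(flex_attention, block_mask = block_mask)
out = flex_attn_fn(q, k, v)   # (batch, heads, seq_len, dim_head)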
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import Callable
 import math
 from functools import partial
 
@@ -16,7 +17,6 @@ from titans_pytorch.associative_scan import (
     pad_at_dim
 )
 
-import einx
 from einops import rearrange, repeat, pack, unpack
 from einops.layers.torch import Rearrange, Reduce
 
@@ -338,9 +338,9 @@ class NeuralMemory(Module):
 
         # take care of chunking
 
-        keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = self.chunk_size) for t in (keys, values))
+        keys, values = tuple(rearrange(t, 'b (n c) d -> (b n) c d', c = chunk_size) for t in (keys, values))
 
-        adaptive_lr = rearrange(adaptive_lr, 'b (n c) -> (b n) c', c = self.chunk_size)
+        adaptive_lr = rearrange(adaptive_lr, 'b (n c) -> (b n) c', c = chunk_size)
 
         # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)
 
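Note on the last hunk: it swaps self.chunk_size for the local chunk_size when folding keys, values, and the per-token adaptive learning rates into chunks. A toy illustration of the einops rearrange pattern in those lines (the shapes here are made up):

# Hedged illustration of the chunking rearrange above, with assumed toy shapes.
import torch
from einops import rearrange

chunk_size = 4
keys = torch.randn(2, 16, 32)      # (batch, seq, dim), seq divisible by chunk_size
adaptive_lr = torch.rand(2, 16)    # one value per token

# fold the sequence into chunks of length chunk_size, merging the chunk index into batch
keys_chunked = rearrange(keys, 'b (n c) d -> (b n) c d', c = chunk_size)
lr_chunked = rearrange(adaptive_lr, 'b (n c) -> (b n) c', c = chunk_size)

print(keys_chunked.shape)   # torch.Size([8, 4, 32])
print(lr_chunked.shape)     # torch.Size([8, 4])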