titans-pytorch 0.0.50__py3-none-any.whl → 0.0.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
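titans_pytorch/mac_transformer.py CHANGED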
@@ -7,16 +7,48 @@ from torch import nn, cat
  import torch.nn.functional as F
  from torch.nn import Module, ModuleList, Linear

+ # flex attention
+ # https://pytorch.org/blog/flexattention/
+
+ flex_attention = None
+
+ try:
+     from torch.nn.attention.flex_attention import flex_attention, create_block_mask
+     if torch.cuda.is_available():
+         flex_attention = torch.compile(flex_attention)
+ except ImportError:
+     pass
+
+ def create_mac_block_mask(seq_len, window_size, persist_mem_len):
+
+     def create_mac_mask(b, h, q_idx, kv_idx):
+         is_persist_mem = kv_idx < persist_mem_len
+         causal_mask = q_idx >= (kv_idx - is_persist_mem)
+         block_diagonal = (q_idx // window_size) == ((kv_idx - is_persist_mem) // window_size)
+         return is_persist_mem | (~is_persist_mem & (causal_mask & block_diagonal))
+
+     block_mask = create_block_mask(create_mac_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len + persist_mem_len, _compile = True)
+     return block_mask
+
+ # einstein notation related
+
  from einops import einsum, repeat, rearrange, pack, unpack
  from einops.layers.torch import Rearrange

- from hyper_connections import get_init_and_expand_reduce_stream_functions
+ # b - batch
+ # n - sequence
+ # h - heads
+ # d - feature dimension

  # absolute and relative positions

  from axial_positional_embedding import ContinuousAxialPositionalEmbedding
  from rotary_embedding_torch import RotaryEmbedding
+
+ # hyper connections / attend from x-transformers, which handles different queries and key lengths better
+
  from x_transformers.attend import Attend
+ from hyper_connections import get_init_and_expand_reduce_stream_functions

  # proposed neural memory

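The new flex-attention path hinges on the `create_mac_mask` predicate above: persistent memory keys are always visible, while ordinary keys are gated by a causal, block-diagonal (windowed) condition. The sketch below is not part of the package; it just materializes the same predicate as a dense boolean mask on CPU, with made-up toy sizes, so the pattern can be inspected without flex attention or a GPU.

```python
import torch

# toy sizes, chosen only for illustration
seq_len, window_size, persist_mem_len = 8, 4, 2

q_idx = torch.arange(seq_len)[:, None]                      # query positions, shape (seq_len, 1)
kv_idx = torch.arange(seq_len + persist_mem_len)[None, :]   # key positions, persistent memory first

# same predicate as create_mac_mask in the hunk above
is_persist_mem = kv_idx < persist_mem_len
causal_mask = q_idx >= (kv_idx - is_persist_mem)
block_diagonal = (q_idx // window_size) == ((kv_idx - is_persist_mem) // window_size)

mask = is_persist_mem | (~is_persist_mem & (causal_mask & block_diagonal))
print(mask.int())  # rows: queries; columns: persistent memory keys, then sequence keys
```

In the package itself, `create_block_mask` compiles this predicate into the block mask that `flex_attention` consumes.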
@@ -96,6 +128,7 @@ class SegmentedAttention(Module):
          heads = 8,
          accept_value_residual = False,
          attend_kwargs: dict = dict(),
+         use_flex_attn = False
      ):
          super().__init__()
          self.norm = nn.RMSNorm(dim)
@@ -125,11 +158,79 @@ class SegmentedAttention(Module):

          self.persistent_memory = nn.Parameter(torch.zeros(2, heads, num_persist_mem_tokens, dim_head))

+         # flex attn related
+
+         assert not (use_flex_attn and not exists(flex_attention)), 'you need to be on the latest pytorch with a cuda device available'
+         self.use_flex_attn = use_flex_attn
+
+         self.segment_len = segment_len
+         self.num_persist_mem_tokens = num_persist_mem_tokens
+
+     def forward_flex(
+         self,
+         seq,
+         value_residual = None,
+         flex_attn_fn: Callable | None = None
+     ):
+
+         assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))
+
+         batch, seq_len = seq.shape[:2]
+
+         # attention
+
+         seq = self.norm(seq)
+
+         q, k, v = self.to_qkv(seq).chunk(3, dim = -1)
+         q, k, v = map(self.split_heads, (q, k, v))
+
+         # value residual
+
+         orig_v = v
+
+         if exists(self.to_learned_v_mix):
+             mix = self.to_learned_v_mix(seq)
+             v = v.lerp(value_residual, mix)
+
+         # take care of persistent memory key / values
+
+         pmk, pmv = repeat(self.persistent_memory, 'kv h n d -> kv b h n d', b = batch)
+
+         # relative positions
+
+         q, k = self.rotary_emb.rotate_queries_with_cached_keys(q, k)
+
+         # persistent memory
+
+         k = cat((pmk, k), dim = -2)
+         v = cat((pmv, v), dim = -2)
+
+         # prep flex attention
+
+         if not exists(flex_attn_fn):
+             block_mask = create_mac_block_mask(seq_len, self.segment_len, self.num_persist_mem_tokens)
+
+             flex_attn_fn = partial(flex_attention, block_mask = block_mask)
+
+         # attention
+
+         out = flex_attn_fn(q, k, v)
+
+         out = self.merge_heads(out)
+
+         out = self.to_out(out)
+
+         return out, orig_v
+
      def forward(
          self,
          seq,
-         value_residual = None
+         value_residual = None,
+         flex_attn_fn: Callable | None = None
      ):
+         if seq.is_cuda and self.use_flex_attn:
+             return self.forward_flex(seq, value_residual, flex_attn_fn)
+
          assert not (exists(value_residual) ^ exists(self.to_learned_v_mix))

          segment_len, num_longterm_mem_tokens = self.segment_len, self.num_longterm_mem_tokens
@@ -159,7 +260,7 @@ class SegmentedAttention(Module):

          # take care of persistent memory key / values

-         pmk, pmv = tuple(repeat(t, 'h n d -> b h n d', b = seq.shape[0]) for t in self.persistent_memory)
+         pmk, pmv = repeat(self.persistent_memory, 'kv ... -> kv b ...', b = seq.shape[0])

          # relative positions

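Taken together, these hunks let `SegmentedAttention` route CUDA inputs through `forward_flex`, with the regular segmented path as the fallback. A hypothetical usage sketch follows; the constructor arguments and import path are inferred from the hunks above rather than from a documented API, so treat the exact signature as an assumption.

```python
import torch
from titans_pytorch.mac_transformer import SegmentedAttention  # import path assumed

use_flex = torch.cuda.is_available()  # flex attention path needs a recent PyTorch + CUDA

attn = SegmentedAttention(
    dim = 384,
    segment_len = 32,             # also the window size used by create_mac_block_mask
    num_persist_mem_tokens = 4,
    use_flex_attn = use_flex      # CPU inputs fall back to the segmented path regardless
)

seq = torch.randn(1, 128, 384)

if use_flex:
    attn, seq = attn.cuda(), seq.cuda()

out, values = attn(seq)  # per the diff, forward returns (out, orig_v)
```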
titans_pytorch/titans.py CHANGED
@@ -56,6 +56,17 @@ def pack_one_with_inverse(t, pattern):

      return packed, inverse

+ def Sequential(*modules):
+     modules = [*filter(exists, modules)]
+
+     if len(modules) == 0:
+         return nn.Identity()
+
+     if len(modules) == 1:
+         return modules[0]
+
+     return nn.Sequential(*modules)
+
  # softclamping gradients

  def softclamp_max(t, max_value):
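The new `Sequential` helper simply drops `None` entries and avoids needless wrappers, which is what allows the `activation` argument further down to default to `None`. A small standalone illustration, with a local `exists` shim standing in for the package's own helper:

```python
import torch.nn as nn

def exists(v):  # shim for the package's helper
    return v is not None

def Sequential(*modules):
    modules = [*filter(exists, modules)]
    if len(modules) == 0:
        return nn.Identity()       # nothing to do
    if len(modules) == 1:
        return modules[0]          # unwrap a single module
    return nn.Sequential(*modules)

print(Sequential())                                  # Identity()
print(Sequential(nn.Linear(4, 4)))                   # bare Linear, unwrapped
print(Sequential(nn.Linear(4, 4), None, nn.SiLU()))  # Sequential(Linear, SiLU), None dropped
```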
@@ -124,9 +135,6 @@ class MemoryAttention(Module):
          ])

      def forward(self, x):
-
-         assert x.shape[-2] > 1, 'chunk size needs to be greater than 1 for using attention as memory'
-
          wq, wk, wv, ffw1, ffw2 = self.weights

          q = F.normalize(x @ wq, dim = -1)
@@ -168,6 +176,7 @@ class NeuralMemory(Module):
          post_rmsnorm = True,
          max_grad_norm: float | None = None,
          use_accelerated_scan = False,
+         activation: Module | None = None,
          default_model_kwargs: dict = dict(
              depth = 2
          )
@@ -225,11 +234,11 @@ class NeuralMemory(Module):

          # queries for retrieving from the model

-         self.to_queries = LinearNoBias(dim, dim_inner)
+         self.to_queries = Sequential(LinearNoBias(dim, dim_inner), activation)

          # keys and values for storing to the model

-         self.to_keys_values = LinearNoBias(dim, dim_inner * 2)
+         self.to_keys_values = Sequential(LinearNoBias(dim, dim_inner * 2), activation)
          self.store_memory_loss_fn = store_memory_loss_fn

          # empty memory embed
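With the two hunks above, passing an `activation` module inserts a nonlinearity after the query and key/value projections, while leaving it as `None` keeps the previous bare linear layers. A hypothetical construction sketch follows; the `chunk_size` argument and the top-level import are assumptions, not taken from this diff.

```python
from torch import nn
from titans_pytorch import NeuralMemory  # top-level export assumed

mem = NeuralMemory(
    dim = 384,
    chunk_size = 64,          # argument name assumed
    activation = nn.SiLU()    # added in this diff; None (the default) keeps plain projections
)

# to_queries / to_keys_values are now Sequential(Linear, SiLU) rather than bare Linear layers
print(mem.to_queries)
print(mem.to_keys_values)
```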
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.0.50
+ Version: 0.0.52
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -135,3 +135,12 @@ $ python train_mac.py
      year = {2024}
  }
  ```
+
+ ```bibtex
+ @inproceedings{Yang2024GatedDN,
+     title = {Gated Delta Networks: Improving Mamba2 with Delta Rule},
+     author = {Songlin Yang and Jan Kautz and Ali Hatamizadeh},
+     year = {2024},
+     url = {https://api.semanticscholar.org/CorpusID:274598177}
+ }
+ ```
@@ -0,0 +1,8 @@
+ titans_pytorch/__init__.py,sha256=wnv_Cjdjqh_h5IqLkQ8xrTtA2K663ITEn-1JeeHofTo,150
+ titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+ titans_pytorch/mac_transformer.py,sha256=khfjpbsy-uT9NIG3dZLsLOG_XSEi7EqcyfbPr7EQc2Q,13192
+ titans_pytorch/titans.py,sha256=T04onF0xhcrosS-Qkx7fcx-Cqgh0TdU5JLdq9l8ayGg,15911
+ titans_pytorch-0.0.52.dist-info/METADATA,sha256=coC9ExIuNvmab0BktSE1NwUgxRaBUV7h_cTHeoJkRJo,4484
+ titans_pytorch-0.0.52.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ titans_pytorch-0.0.52.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ titans_pytorch-0.0.52.dist-info/RECORD,,
@@ -1,8 +0,0 @@
- titans_pytorch/__init__.py,sha256=wnv_Cjdjqh_h5IqLkQ8xrTtA2K663ITEn-1JeeHofTo,150
- titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
- titans_pytorch/mac_transformer.py,sha256=EMhxPt86Vr6LFvPm0OLMFYLaIY19khU9yIHkIhl2EMA,10316
- titans_pytorch/titans.py,sha256=TklMAxNDxgFBpJZFJa8hEhqA_DITmT6EM0p0ueE1jo8,15712
- titans_pytorch-0.0.50.dist-info/METADATA,sha256=KU7TTrH89eNVPP10NKKTDKnW-ik344_kVQkAXW7NRL8,4210
- titans_pytorch-0.0.50.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- titans_pytorch-0.0.50.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- titans_pytorch-0.0.50.dist-info/RECORD,,