titans-pytorch 0.1.14.tar.gz → 0.1.17.tar.gz

This diff shows the content differences between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.14
+Version: 0.1.17
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -35,7 +35,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
-Requires-Dist: axial-positional-embedding>=0.3.7
+Requires-Dist: axial-positional-embedding>=0.3.9
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.8
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.1.14"
+version = "0.1.17"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -26,7 +26,7 @@ classifiers=[
 
 dependencies = [
     "accelerated-scan>=0.2.0",
-    "axial_positional_embedding>=0.3.7",
+    "axial_positional_embedding>=0.3.9",
     "einops>=0.8.0",
     "einx>=0.3.0",
     "hyper-connections>=0.1.8",
@@ -3,7 +3,7 @@ from torch import nn
 
 import pytest
 from titans_pytorch import NeuralMemory
-from titans_pytorch.mac_transformer import flex_attention, SegmentedAttention
+from titans_pytorch.mac_transformer import flex_attention, SegmentedAttention, MemoryAsContextTransformer
 
 def exists(v):
     return v is not None
@@ -92,8 +92,6 @@ def test_mac(
     num_longterm_mem_tokens,
     neural_mem_gate_attn_output
 ):
-    from titans_pytorch.mac_transformer import MemoryAsContextTransformer
-
     transformer = MemoryAsContextTransformer(
         num_tokens = 256,
         dim = 256,
@@ -109,6 +107,25 @@ def test_mac(
     logits = transformer(x)
     assert logits.shape == (1, seq_len, 256)
 
+def test_mac_sampling():
+    transformer = MemoryAsContextTransformer(
+        num_tokens = 256,
+        dim = 256,
+        depth = 2,
+        segment_len = 32,
+        num_persist_mem_tokens = 4,
+        num_longterm_mem_tokens = 16,
+    )
+
+    ids = torch.randint(0, 256, (1, 1023))
+
+    # after much training
+
+    sampled = transformer.sample(ids[:, :4], 53, use_cache = False, temperature = 0.)
+    sampled_with_cache = transformer.sample(ids[:, :4], 53, use_cache = True, temperature = 0.)
+
+    assert torch.allclose(sampled, sampled_with_cache)
+
 @pytest.mark.parametrize('seq_len', (1023, 17))
 @pytest.mark.parametrize('sliding', (True, False))
 def test_flex(
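Note: sampling with temperature = 0. is what makes this test meaningful: the filter-then-sample step degenerates to a deterministic argmax over the vocabulary, so the cached and uncached decodes of the same prompt must agree token for token. A minimal sketch of the degenerate case (illustrative, not the package's filter code):

    import torch

    logits = torch.tensor([[2.0, 0.5, -1.0]])
    # at temperature = 0. the sample is just the argmax, hence reproducible
    assert logits.argmax(dim = -1).item() == 0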
@@ -537,7 +537,8 @@ class MemoryAsContextTransformer(Module):
         filter_kwargs: dict = dict(
             min_p = 0.1,
         ),
-        show_progress = True
+        show_progress = True,
+        use_cache = False
     ):
         was_training = self.training
         self.eval()
@@ -547,8 +548,37 @@ class MemoryAsContextTransformer(Module):
 
         iter_wrap = tqdm.tqdm if show_progress else identity
 
+        # cache for axial pos, attention, and neural memory
+
+        cache = None
+        factorized_pos_emb = None
+
+        # precompute factorized pos emb
+
+        if use_cache:
+            round_up_seq_len = round_up_multiple(seq_len, self.segment_len)
+            longterm_mem_lens = (round_up_seq_len // self.segment_len) * self.num_longterm_mem_tokens
+            seq_len_with_mem = round_up_seq_len + longterm_mem_lens
+
+            axial_dims = self.axial_pos_emb.maybe_derive_outer_dim(seq_len_with_mem, (self.neural_memory_segment_len,))
+
+            factorized_pos_emb = self.axial_pos_emb(axial_dims, return_factorized = True)
+
+        # sample
+
         for _ in iter_wrap(range(sample_num_times)):
-            logits = self.forward(out, disable_flex_attn = True)
+
+            logits, next_cache = self.forward(
+                out,
+                disable_flex_attn = True,
+                cache = cache,
+                return_cache = True,
+                factorized_pos_emb = factorized_pos_emb
+            )
+
+            if use_cache:
+                cache = next_cache
+
             logits = logits[:, -1]
 
             logits = filter_fn(logits, **filter_kwargs)
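Note: the precompute block rounds the prompt-plus-generation length up to a whole number of segments so the factorized embedding covers every decode step, then adds the longterm memory tokens that get interleaved per segment. round_up_multiple is a helper from the package; a minimal sketch of its assumed behavior:

    def round_up_multiple(n, mult):
        # smallest multiple of mult that is >= n (assumed semantics)
        return ((n + mult - 1) // mult) * mult

    # e.g. segment_len = 32, num_longterm_mem_tokens = 16:
    assert round_up_multiple(1023, 32) == 1024    # 32 segments
    assert (1024 // 32) * 16 == 512               # interleaved memory tokens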
@@ -565,7 +595,10 @@ class MemoryAsContextTransformer(Module):
         x,
         return_loss = False,
         return_loss_breakdown = False,
-        disable_flex_attn = False
+        disable_flex_attn = False,
+        cache = None,
+        return_cache = False,
+        factorized_pos_emb = None
     ):
 
         if return_loss:
@@ -593,7 +626,7 @@ class MemoryAsContextTransformer(Module):
         # apply axial positional embedding
         # so intra and inter segment can be more easily discerned by the network
 
-        pos_emb = self.axial_pos_emb.forward_with_seq_len(seq_len_with_mem, (neural_mem_segment_len,))
+        pos_emb = self.axial_pos_emb.forward_with_seq_len(seq_len_with_mem, (neural_mem_segment_len,), factorized = factorized_pos_emb)
 
         x = x + pos_emb
 
@@ -651,7 +684,10 @@ class MemoryAsContextTransformer(Module):
         logits = self.to_logits(x)
 
         if not return_loss:
-            return logits
+            if not return_cache:
+                return logits
+
+            return logits, cache
 
         ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
 
@@ -301,6 +301,45 @@ class MemoryAttention(Module):
 
         return out
 
+# associative scan wrapper
+
+class AssocScan(Module):
+    def __init__(
+        self,
+        use_accelerated = False
+    ):
+        super().__init__()
+        self.use_accelerated = use_accelerated
+
+    def forward(self, gates, inputs):
+
+        if not self.use_accelerated:
+            _, outputs = associative_scan(binary_operator, (gates, inputs))
+            return outputs
+
+        from accelerated_scan.triton import scan as triton_scan
+        from accelerated_scan.warp import scan as warp_scan
+
+        scan = triton_scan if gates.is_cuda else warp_scan
+
+        def accelerate_scan_fn(gates, inputs):
+            gates = gates.expand_as(inputs)
+            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
+
+            seq_len = gates.shape[-1]
+            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
+
+            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
+            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
+
+            outputs = scan(gates.contiguous(), inputs.contiguous())
+
+            outputs = outputs[..., :seq_len]
+            outputs = rearrange(outputs, 'b d n -> b n d')
+            return outputs
+
+        return accelerate_scan_fn(gates, inputs)
+
 # main neural memory
 
 def default_adaptive_step_transform(adaptive_step, max_lr = 1e-2):
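Note: AssocScan folds the two existing scan strategies behind one Module: the default path runs the pure-PyTorch associative_scan with binary_operator, while the accelerated path pads the sequence length up to a power of two (minimum 32) before calling the accelerated-scan kernels. Both compute the same first-order linear recurrence, out[t] = gates[t] * out[t-1] + inputs[t]. A naive reference loop, handy as a correctness oracle (a sketch, not part of the package):

    import torch

    def naive_scan(gates, inputs):
        # out[t] = gates[t] * out[t - 1] + inputs[t], scanned left to right
        out, outs = torch.zeros_like(inputs[:, 0]), []
        for t in range(inputs.shape[1]):
            out = gates[:, t] * out + inputs[:, t]
            outs.append(out)
        return torch.stack(outs, dim = 1)

    gates, inputs = torch.rand(1, 8, 4), torch.randn(1, 8, 4)
    ref = naive_scan(gates, inputs)   # compare against AssocScan()(gates, inputs)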
@@ -339,6 +378,10 @@ class NeuralMemory(Module):
 
         self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)
 
+        # associative scan
+
+        self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)
+
         # norms
 
         self.retrieve_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
@@ -564,38 +607,6 @@ class NeuralMemory(Module):
 
         surprises = grads.apply(lambda t: -t)
 
-        # determine scan function
-
-        def default_associative_scan(gates, inputs):
-            _, outputs = associative_scan(binary_operator, (gates, inputs))
-            return outputs
-
-        if self.use_accelerated_scan:
-            from accelerated_scan.triton import scan as triton_scan
-            from accelerated_scan.warp import scan as warp_scan
-
-            scan = triton_scan if seq.is_cuda else warp_scan
-
-            def accelerate_scan_fn(gates, inputs):
-                gates = gates.expand_as(inputs)
-                gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
-
-                seq_len = gates.shape[-1]
-                next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
-
-                gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
-                inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
-
-                outputs = scan(gates.contiguous(), inputs.contiguous())
-
-                outputs = outputs[..., :seq_len]
-                outputs = rearrange(outputs, 'b d n -> b n d')
-                return outputs
-
-            scan_fn = accelerate_scan_fn
-        else:
-            scan_fn = default_associative_scan
-
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
 
         next_momentum = TensorDict() if has_momentum else None
@@ -610,12 +621,12 @@ class NeuralMemory(Module):
             # derive momentum with associative scan - eq (10)
 
             if has_momentum:
-                update = scan_fn(adaptive_momentum, surprise) # momentum is S / surprise in the paper
+                update = self.assoc_scan(adaptive_momentum, surprise) # momentum is S / surprise in the paper
                 momentum = update
 
             # use associative scan again for learned forgetting (weight decay) - eq (13)
 
-            update = scan_fn(1. - decay_factor, update)
+            update = self.assoc_scan(1. - decay_factor, update)
 
             updates[param_name] = inverse_pack(update)
 
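Note: the two assoc_scan calls realize equations (10) and (13) of the Titans paper as linear recurrences: momentum accumulates the negated, lr-scaled gradient surprise, and a learned forgetting gate decays the accumulated update. Unrolled, with eta the adaptive momentum and alpha the decay factor (a sketch of the recurrences, not the package code):

    # eq (10): S_t = eta_t * S_{t-1} + surprise_t     (momentum scan)
    # eq (13): M_t = (1 - alpha_t) * M_{t-1} + S_t    (forgetting scan)
    def unrolled_update(etas, alphas, surprises):
        S = M = 0.
        for eta, alpha, surprise in zip(etas, alphas, surprises):
            S = eta * S + surprise
            M = (1. - alpha) * M + S
        return M

    assert abs(unrolled_update([0.9, 0.9], [0.1, 0.1], [1., 1.]) - 2.8) < 1e-9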