titans-pytorch 0.1.14__tar.gz → 0.1.15__tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.14
+Version: 0.1.15
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -35,7 +35,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
-Requires-Dist: axial-positional-embedding>=0.3.7
+Requires-Dist: axial-positional-embedding>=0.3.9
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.8
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.1.14"
+version = "0.1.15"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -26,7 +26,7 @@ classifiers=[
 
 dependencies = [
     "accelerated-scan>=0.2.0",
-    "axial_positional_embedding>=0.3.7",
+    "axial_positional_embedding>=0.3.9",
     "einops>=0.8.0",
     "einx>=0.3.0",
     "hyper-connections>=0.1.8",
@@ -301,6 +301,45 @@ class MemoryAttention(Module):
 
         return out
 
+# associative scan wrapper
+
+class AssocScan(Module):
+    def __init__(
+        self,
+        use_accelerated = False
+    ):
+        super().__init__()
+        self.use_accelerated = use_accelerated
+
+    def forward(self, gates, inputs):
+
+        if not self.use_accelerated:
+            _, outputs = associative_scan(binary_operator, (gates, inputs))
+            return outputs
+
+        from accelerated_scan.triton import scan as triton_scan
+        from accelerated_scan.warp import scan as warp_scan
+
+        scan = triton_scan if gates.is_cuda else warp_scan
+
+        def accelerate_scan_fn(gates, inputs):
+            gates = gates.expand_as(inputs)
+            gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
+
+            seq_len = gates.shape[-1]
+            next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
+
+            gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
+            inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
+
+            outputs = scan(gates.contiguous(), inputs.contiguous())
+
+            outputs = outputs[..., :seq_len]
+            outputs = rearrange(outputs, 'b d n -> b n d')
+            return outputs
+
+        return accelerate_scan_fn(gates, inputs)
+
 # main neural memory
 
 def default_adaptive_step_transform(adaptive_step, max_lr = 1e-2):
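Both branches of the new AssocScan.forward compute the same first-order linear recurrence, out[t] = gates[t] * out[t-1] + inputs[t]; the padding to the next power of two (minimum 2**5 = 32) accommodates the fixed-length requirements of the accelerated-scan kernels. A minimal pure-PyTorch reference with the same semantics, as a standalone sketch (the sequential loop and names here are illustrative, not code from the package):

    import torch

    def reference_scan(gates, inputs):
        # out[t] = gates[t] * out[t - 1] + inputs[t], with out[-1] = 0
        # gates, inputs: (batch, seq_len, dim)
        out = torch.zeros_like(inputs[:, 0])
        outs = []
        for t in range(inputs.shape[1]):
            out = gates[:, t] * out + inputs[:, t]
            outs.append(out)
        return torch.stack(outs, dim = 1)

    gates = torch.rand(2, 8, 16)        # gates in [0, 1], e.g. momentum or decay factors
    inputs = torch.randn(2, 8, 16)      # e.g. per-timestep surprises
    out = reference_scan(gates, inputs) # (2, 8, 16)

The associative scan produces the same outputs in O(log n) parallel steps, since the recurrence's combine operation, ((a1, b1), (a2, b2)) -> (a1 * a2, a2 * b1 + b2), is associative.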
@@ -339,6 +378,10 @@ class NeuralMemory(Module):
 
         self.retrieve_chunk_size, self.store_chunk_size = pair(chunk_size)
 
+        # associative scan
+
+        self.assoc_scan = AssocScan(use_accelerated = use_accelerated_scan)
+
         # norms
 
         self.retrieve_norm = nn.RMSNorm(dim) if pre_rmsnorm else nn.Identity()
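With the wrapper constructed once in __init__, the choice of scan variant moves from per-call control flow (removed in the hunk below) to module configuration. A hypothetical usage sketch, assuming AssocScan and its module-level dependencies (associative_scan, binary_operator) are in scope as in this file:

    scan = AssocScan(use_accelerated = False)

    gates = torch.rand(1, 4, 8)   # (batch, seq_len, dim)
    inputs = torch.randn(1, 4, 8)

    out = scan(gates, inputs)     # (1, 4, 8): out[:, t] = gates[:, t] * out[:, t - 1] + inputs[:, t]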
@@ -564,38 +607,6 @@ class NeuralMemory(Module):
 
         surprises = grads.apply(lambda t: -t)
 
-        # determine scan function
-
-        def default_associative_scan(gates, inputs):
-            _, outputs = associative_scan(binary_operator, (gates, inputs))
-            return outputs
-
-        if self.use_accelerated_scan:
-            from accelerated_scan.triton import scan as triton_scan
-            from accelerated_scan.warp import scan as warp_scan
-
-            scan = triton_scan if seq.is_cuda else warp_scan
-
-            def accelerate_scan_fn(gates, inputs):
-                gates = gates.expand_as(inputs)
-                gates, inputs = tuple(rearrange(t, 'b n d -> b d n') for t in (gates, inputs))
-
-                seq_len = gates.shape[-1]
-                next_power_two_seq_len = 2 ** max(5, int(math.ceil(math.log2(seq_len))))
-
-                gates = F.pad(gates, (0, next_power_two_seq_len - seq_len))
-                inputs = F.pad(inputs, (0, next_power_two_seq_len - seq_len))
-
-                outputs = scan(gates.contiguous(), inputs.contiguous())
-
-                outputs = outputs[..., :seq_len]
-                outputs = rearrange(outputs, 'b d n -> b n d')
-                return outputs
-
-            scan_fn = accelerate_scan_fn
-        else:
-            scan_fn = default_associative_scan
-
         # momentum + weight decay - momentum is the new contribution, as most linear RNNs have learned forgetting gates
 
         next_momentum = TensorDict() if has_momentum else None
@@ -610,12 +621,12 @@ class NeuralMemory(Module):
             # derive momentum with associative scan - eq (10)
 
             if has_momentum:
-                update = scan_fn(adaptive_momentum, surprise) # momentum is S / surprise in the paper
+                update = self.assoc_scan(adaptive_momentum, surprise) # momentum is S / surprise in the paper
                 momentum = update
 
             # use associative scan again for learned forgetting (weight decay) - eq (13)
 
-            update = scan_fn(1. - decay_factor, update)
+            update = self.assoc_scan(1. - decay_factor, update)
 
             updates[param_name] = inverse_pack(update)
 
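For reference, the equations cited in the comments above, as given in the Titans paper (Behrouz et al., 2024), are the momentum/surprise recurrence (eq. 10) and the forgetting update (eq. 13):

    S_t = \eta_t S_{t-1} - \theta_t \nabla\ell(M_{t-1}; x_t)    (10)
    M_t = (1 - \alpha_t) M_{t-1} + S_t                          (13)

Per the code comments, \eta_t appears to correspond to adaptive_momentum, \theta_t to the adaptive step size folded into the surprise, and \alpha_t to decay_factor. Both equations are affine recurrences of the form out_t = gate_t * out_{t-1} + in_t, which is why the same AssocScan module is applied twice in the hunk above.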