titans-pytorch 0.1.38__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.38
+Version: 0.2.1
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -56,7 +56,7 @@ Description-Content-Type: text/markdown
 
 <img src="./fig1.png" width="400px"></img>
 
-## Titans - Pytorch (wip)
+## Titans - Pytorch
 
 Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.
 
@@ -2,7 +2,7 @@
 
 <img src="./fig1.png" width="400px"></img>
 
-## Titans - Pytorch (wip)
+## Titans - Pytorch
 
 Unofficial implementation of [Titans](https://arxiv.org/abs/2501.00663) in Pytorch. Will also contain some explorations into architectures beyond their simple 1-4 layer MLP for the neural memory module, if it works well to any degree.
 
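For orientation on the package this diff modifies, a minimal usage sketch in the spirit of the repository README follows. It assumes `NeuralMemory` is importable from `titans_pytorch` and that its forward call returns the retrieved sequence together with a memory state; the exact constructor arguments and return values may differ between 0.1.38 and 0.2.1.

    import torch
    from titans_pytorch import NeuralMemory

    # neural long-term memory module (assumed constructor arguments)
    mem = NeuralMemory(
        dim = 384,
        chunk_size = 64
    )

    seq = torch.randn(2, 1024, 384)

    # retrieve from memory while updating it on the incoming sequence
    # (some versions may return only the retrieved sequence)
    retrieved, mem_state = mem(seq)

    assert retrieved.shape == seq.shape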
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.1.38"
+version = "0.2.1"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -184,9 +184,10 @@ def test_mac(
     assert logits.shape == (1, seq_len, 256)
 
 @pytest.mark.parametrize('sliding', (False, True))
-@pytest.mark.parametrize('mem_layers', (()))
+@pytest.mark.parametrize('mem_layers', ((), None))
 @pytest.mark.parametrize('longterm_mems', (0, 4, 16))
-@pytest.mark.parametrize('prompt_len', (0, 4, 16))
+@pytest.mark.parametrize('prompt_len', (4, 16))
+@torch_default_dtype(torch.float64)
 def test_mac_sampling(
     sliding,
     mem_layers,
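The newly applied `@torch_default_dtype(torch.float64)` decorator pins the global default dtype for the sampling test, presumably so the cached and uncached decoding paths agree numerically. A hypothetical implementation of such a decorator is sketched below; the package's actual test helper may be written differently.

    from functools import wraps
    import torch

    def torch_default_dtype(dtype):
        # decorator factory: run the wrapped test under `dtype`, then restore the previous default
        def decorator(fn):
            @wraps(fn)
            def inner(*args, **kwargs):
                prev_dtype = torch.get_default_dtype()
                torch.set_default_dtype(dtype)
                try:
                    return fn(*args, **kwargs)
                finally:
                    torch.set_default_dtype(prev_dtype)
            return inner
        return decorator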
@@ -111,6 +111,7 @@ def pad_and_segment_with_inverse(
     seq,
     segment_len,
     fold_into_batch = True,
+    inverse_remove_pad = True
 ):
     batch, seq_len = seq.shape[:2]
     next_seq_len_mult = round_up_multiple(seq_len, segment_len)
@@ -124,15 +125,12 @@ def pad_and_segment_with_inverse(
     if fold_into_batch:
         seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)
 
-    shape = seq.shape
-
     def inverse(out):
-        unchanged_shape = out.shape == shape
 
         if fold_into_batch:
             out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)
 
-        if needs_pad and unchanged_shape:
+        if needs_pad and inverse_remove_pad:
             out = out[..., :-padding, :]
 
         return out
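Pieced together from the two hunks above, a standalone sketch of the revised helper follows. The change replaces the implicit shape comparison with an explicit `inverse_remove_pad` flag, so callers can keep the padded length when inverting. The padding step and `round_up_multiple` sit outside the diff context and are reconstructed here, so treat those details as assumptions.

    import torch
    import torch.nn.functional as F
    from einops import rearrange

    def round_up_multiple(n, mult):
        # assumed helper: round n up to the nearest multiple of mult
        return ((n + mult - 1) // mult) * mult

    def pad_and_segment_with_inverse(
        seq,
        segment_len,
        fold_into_batch = True,
        inverse_remove_pad = True
    ):
        batch, seq_len = seq.shape[:2]
        next_seq_len_mult = round_up_multiple(seq_len, segment_len)

        padding = next_seq_len_mult - seq_len
        needs_pad = padding > 0

        if needs_pad:
            # right-pad the sequence dimension up to a multiple of segment_len (reconstructed step)
            seq = F.pad(seq, (0, 0, 0, padding))

        if fold_into_batch:
            # fold each segment into the batch dimension
            seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)

        def inverse(out):
            if fold_into_batch:
                out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)

            if needs_pad and inverse_remove_pad:
                # drop the padded positions only when the caller asks for it
                out = out[..., :-padding, :]

            return out

        return seq, inverse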
@@ -493,7 +491,8 @@ class MemoryAsContextTransformer(Module):
         aux_kv_recon_loss_weight = 0.,
         use_flex_attn = False,
         sliding_window_attn = False,
-        weight_tie_memory_model = False
+        weight_tie_memory_model = False,
+        prev_neural_mem_update_for_weights = None
     ):
         super().__init__()
 
@@ -535,6 +534,7 @@ class MemoryAsContextTransformer(Module):
         assert exists(neural_memory_model), '`neural_memory_model` must be explicitly set'
 
         self.weight_tie_memory_model = weight_tie_memory_model
+        self.prev_neural_mem_update_for_weights = default(prev_neural_mem_update_for_weights, weight_tie_memory_model)
 
         # value residual learning for neural memory
 
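The new flag falls back to `weight_tie_memory_model` through the `default` helper, which in this codebase returns its first argument unless that argument is None. A small illustration of the intended behavior, with the helper definitions assumed:

    def exists(v):
        return v is not None

    def default(v, d):
        # return v when provided, otherwise fall back to d
        return v if exists(v) else d

    # leaving the option unset keeps the previous coupling to weight tying ...
    assert default(None, True) is True
    # ... while an explicit value decouples the two switches
    assert default(False, True) is False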
@@ -704,7 +704,7 @@ class MemoryAsContextTransformer(Module):
 
         # math
 
-        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, weight_tie_memory_model = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.weight_tie_memory_model
+        batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size, prev_neural_mem_update_for_weights = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size, self.prev_neural_mem_update_for_weights
 
         seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)
 
@@ -714,7 +714,7 @@ class MemoryAsContextTransformer(Module):
 
         # intersperse longterm memory
 
-        x, inverse_segment = pad_and_segment_with_inverse(x, segment_len)
+        x, inverse_segment = pad_and_segment_with_inverse(x, segment_len, inverse_remove_pad = False)
 
         mems = repeat(self.longterm_mems, 'n d -> b n d', b = x.shape[0])
         x, inverse_pack_mems = pack_with_inverse((x, mems), 'b * d')
@@ -816,7 +816,7 @@ class MemoryAsContextTransformer(Module):
                 if self.mem_add_value_residual:
                     mem_value_residual = next_mem_value_residual
 
-                if weight_tie_memory_model:
+                if prev_neural_mem_update_for_weights:
                     neural_memory_updates = next_neural_mem_cache.updates
 
                 if self.gate_attn_output:
@@ -856,7 +856,9 @@ class MemoryAsContextTransformer(Module):
 
             next_kv_caches = next_kv_caches[..., -attn_window_size:, :]
 
-            if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
+            kv_cache_length = next_kv_caches.shape[-2]
+
+            if not self.sliding_window_attn and divisible_by(kv_cache_length, attn_window_size):
                 next_kv_caches = next_kv_caches[..., 0:0, :]
 
             next_cache = (
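The second hunk swaps `seq_len_with_mem` for the length of the already-truncated kv cache when deciding whether a full attention window has been cached and the window cache can be reset. A small sketch of that guard, with `divisible_by` assumed to be the usual modulo check:

    def divisible_by(num, den):
        return (num % den) == 0

    attn_window_size = 64

    # the cache was just truncated to at most one window, so its own length,
    # not the overall sequence length, signals a completed window
    kv_cache_length = 64
    assert divisible_by(kv_cache_length, attn_window_size)       # window complete: cache is cleared

    kv_cache_length = 40
    assert not divisible_by(kv_cache_length, attn_window_size)   # keep caching within the window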
@@ -878,7 +880,7 @@ class MemoryAsContextTransformer(Module):
 
         if not is_inferencing:
 
-            x, inverse_segment = pad_and_segment_with_inverse(x, attn_window_size)
+            x, inverse_segment = pad_and_segment_with_inverse(x, attn_window_size, inverse_remove_pad = False)
 
             x, _ = inverse_pack_mems(x)
 
@@ -53,6 +53,7 @@ WANDB_ONLINE = False # turn this on to pipe experiment to cloud
 
 USE_ACCELERATED_SCAN = True
 USE_FLEX_ATTN = True
+USE_FAST_INFERENCE = False
 
 # wandb experiment tracker
 
@@ -163,6 +164,6 @@ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval = 10., desc = 'training'):
         prime = decode_tokens(inp)
         print(f'%s \n\n %s', (prime, '*' * 100))
 
-        sample = model.sample(inp[None, ...], GENERATE_LENGTH)
+        sample = model.sample(inp[None, ...], GENERATE_LENGTH, use_cache = USE_FAST_INFERENCE)
         output_str = decode_tokens(sample[0])
         print(output_str)
3 files without changes