titans-pytorch 0.1.30__tar.gz → 0.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  train_local.py
+ .DS_Store

  # Byte-compiled / optimized / DLL files
  __pycache__/
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: titans-pytorch
- Version: 0.1.30
+ Version: 0.1.32
  Summary: Titans
  Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
  Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -82,7 +82,7 @@ mem = NeuralMemory(
  ).cuda()

  seq = torch.randn(2, 1024, 384).cuda()
- retrieved = mem(seq)
+ retrieved, mem_state = mem(seq)

  assert seq.shape == retrieved.shape
  ```
@@ -28,7 +28,7 @@ mem = NeuralMemory(
  ).cuda()

  seq = torch.randn(2, 1024, 384).cuda()
- retrieved = mem(seq)
+ retrieved, mem_state = mem(seq)

  assert seq.shape == retrieved.shape
  ```
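Both README hunks above document the same API change: `NeuralMemory.forward` now returns the retrieved sequence together with a memory state instead of the sequence alone. A minimal sketch of the new calling convention (constructor arguments beyond `dim` and `chunk_size` are taken from the tests in this diff, not from the README):

```python
import torch
from titans_pytorch import NeuralMemory

# dim / chunk_size values mirror the tests elsewhere in this diff
mem = NeuralMemory(dim = 384, chunk_size = 64)

seq = torch.randn(2, 1024, 384)

# previously: retrieved = mem(seq)
retrieved, mem_state = mem(seq)

assert retrieved.shape == seq.shape
```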
@@ -1,6 +1,6 @@
  [project]
  name = "titans-pytorch"
- version = "0.1.30"
+ version = "0.1.32"
  description = "Titans"
  authors = [
  { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -52,12 +52,12 @@ def test_titans(
  )

  seq = torch.randn(2, seq_len, 384)
- retrieved = mem(seq)
+ retrieved, _ = mem(seq)

  assert seq.shape == retrieved.shape

  def test_titans_attn_memory():
- from titans_pytorch.titans import MemoryAttention
+ from titans_pytorch.neural_memory import MemoryAttention

  mem = NeuralMemory(
  dim = 384,
@@ -68,7 +68,7 @@ def test_titans_attn_memory():
  )

  seq = torch.randn(2, 1024, 384)
- retrieved = mem(seq)
+ retrieved, _ = mem(seq)

  assert seq.shape == retrieved.shape

@@ -81,7 +81,7 @@ def test_retrieve_store_diff_seq():
  retrieve_seq = torch.randn(2, 64 * 64, 384)
  store_seq = torch.randn(2, 64 * 32, 384)

- retrieved = mem(retrieve_seq, store_seq = store_seq)
+ retrieved, _ = mem(retrieve_seq, store_seq = store_seq)

  assert retrieve_seq.shape == retrieved.shape

@@ -94,7 +94,7 @@ def test_overriding_chunk_size():
  seq = torch.randn(2, 128 * 16, 384)
  store_seq = torch.randn(2, 128 * 8, 384)

- retrieved = mem(seq, store_seq, chunk_size = 16, store_chunk_size = 8)
+ retrieved, _ = mem(seq, store_seq, chunk_size = 16, store_chunk_size = 8)

  assert seq.shape == retrieved.shape

@@ -124,18 +124,22 @@ def test_mac(
  assert logits.shape == (1, seq_len, 256)

  @pytest.mark.parametrize('sliding', (False, True))
- @pytest.mark.parametrize('mem_layers', ((), None, (4,)))
+ @pytest.mark.parametrize('mem_layers', (()))
+ @pytest.mark.parametrize('longterm_mems', (0, 4, 16))
+ @pytest.mark.parametrize('prompt_len', (0, 4, 16))
  def test_mac_sampling(
  sliding,
- mem_layers
+ mem_layers,
+ longterm_mems,
+ prompt_len
  ):
  transformer = MemoryAsContextTransformer(
  num_tokens = 256,
  dim = 256,
- depth = 2,
+ depth = 4,
  segment_len = 32,
  num_persist_mem_tokens = 4,
- num_longterm_mem_tokens = 0,
+ num_longterm_mem_tokens = longterm_mems,
  sliding_window_attn = sliding,
  neural_memory_layers = mem_layers,
  neural_mem_gate_attn_output = False
@@ -145,29 +149,46 @@ def test_mac_sampling(

  # after much training

- sampled = transformer.sample(ids[:, :4], 53, use_cache = False, temperature = 0.)
- sampled_with_cache = transformer.sample(ids[:, :4], 53, use_cache = True, temperature = 0.)
+ prompt = ids[:, :prompt_len]
+
+ sampled = transformer.sample(prompt, 53, use_cache = False, temperature = 0.)
+ sampled_with_cache = transformer.sample(prompt, 53, use_cache = True, temperature = 0.)

  assert torch.allclose(sampled, sampled_with_cache)

  @pytest.mark.parametrize('seq_len', (2, 64, 256))
+ @pytest.mark.parametrize('prompt_len', (0, 65))
+ @pytest.mark.parametrize('mem_chunk_size', (2, 32, 64))
  @torch_default_dtype(torch.float64)
  def test_neural_mem_inference(
- seq_len
+ seq_len,
+ prompt_len,
+ mem_chunk_size
  ):
  mem = NeuralMemory(
  dim = 384,
- chunk_size = 64,
+ chunk_size = mem_chunk_size,
  )

  seq = torch.randn(2, seq_len, 384)
- parallel_retrieved = mem(seq)
+ parallel_retrieved, _ = mem(seq)

  assert seq.shape == parallel_retrieved.shape

  state = None
  sequential_retrieved = []

+ # test initial parallel prompt
+
+ test_parallel_prompt = prompt_len > 0 and prompt_len < seq_len
+
+ if test_parallel_prompt:
+ prompt, seq = seq[:, :prompt_len], seq[:, prompt_len:]
+ retrieved_prompt, state = mem(prompt)
+ sequential_retrieved.append(retrieved_prompt)
+
+ # sequential inference
+
  for token in seq.unbind(dim = 1):

  one_retrieved, state = mem.forward_inference(
@@ -208,7 +229,7 @@ def test_flex(

  @torch_default_dtype(torch.float64)
  def test_assoc_scan():
- from titans_pytorch.titans import AssocScan
+ from titans_pytorch.neural_memory import AssocScan
  torch.set_default_dtype(torch.float64)

  scan = AssocScan()
@@ -1,4 +1,4 @@
- from titans_pytorch.titans import (
+ from titans_pytorch.neural_memory import (
  NeuralMemory,
  MemoryMLP,
  MemoryAttention,
@@ -65,7 +65,7 @@ from hyper_connections import get_init_and_expand_reduce_stream_functions

  # proposed neural memory

- from titans_pytorch.titans import NeuralMemory
+ from titans_pytorch.neural_memory import NeuralMemory

  # constants

@@ -106,7 +106,11 @@ def pad_at_dim(t, pad, dim = -1, value = 0.):
  zeros = ((0, 0) * dims_from_right)
  return F.pad(t, (*zeros, *pad), value = value)

- def pad_and_segment_with_inverse(seq, segment_len, fold_into_batch = True):
+ def pad_and_segment_with_inverse(
+ seq,
+ segment_len,
+ fold_into_batch = True,
+ ):
  batch, seq_len = seq.shape[:2]
  next_seq_len_mult = round_up_multiple(seq_len, segment_len)

@@ -119,11 +123,15 @@ def pad_and_segment_with_inverse(seq, segment_len, fold_into_batch = True):
  if fold_into_batch:
  seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)

- def inverse(out, remove_pad = True):
+ shape = seq.shape
+
+ def inverse(out):
+ unchanged_shape = out.shape == shape
+
  if fold_into_batch:
  out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)

- if needs_pad and remove_pad:
+ if needs_pad and unchanged_shape:
  out = out[..., :-padding, :]

  return out
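The `inverse` closure above no longer takes a `remove_pad` flag; it records the segmented shape at creation time and strips the padding only when the tensor passed back still has that shape, which is why the `x = inverse_segment(x, remove_pad = False)` call site later in this diff loses its argument. A simplified standalone sketch of the same idea (not the package function, which also handles extra leading dims and the `fold_into_batch` flag):

```python
import torch
import torch.nn.functional as F
from einops import rearrange

def pad_and_segment_with_inverse(seq, segment_len):
    # pad up to a multiple of segment_len, fold windows into the batch dim,
    # and return an inverse that only strips the pad if the shape is unchanged
    batch, seq_len = seq.shape[:2]
    padding = -seq_len % segment_len
    needs_pad = padding > 0

    if needs_pad:
        seq = F.pad(seq, (0, 0, 0, padding))

    seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)
    shape = seq.shape

    def inverse(out):
        unchanged_shape = out.shape == shape
        out = rearrange(out, '(b w) n d -> b (w n) d', b = batch)

        if needs_pad and unchanged_shape:
            out = out[:, :-padding]

        return out

    return seq, inverse

seq = torch.randn(2, 50, 8)
segments, inverse = pad_and_segment_with_inverse(seq, 32)
assert inverse(segments).shape == seq.shape
```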
@@ -582,13 +590,8 @@ class MemoryAsContextTransformer(Module):
  self,
  seq_index
  ):
- total_segment_len = self.attn_window_size
-
- seq = seq_index + 1
- seq -= int((seq % total_segment_len) == 0)
- last_segment_len = round_down_multiple(seq, total_segment_len)
- segment_seq = seq - last_segment_len
- return (segment_seq - self.segment_len) > 0
+ total_segment_len, segment_len = self.attn_window_size, self.segment_len
+ return ((seq_index % total_segment_len + 1) - segment_len) > 0

  def seq_len_with_longterm_mem(
  self,
@@ -597,7 +600,7 @@ class MemoryAsContextTransformer(Module):
  assert seq_len > 0

  segment_len, num_mem = self.segment_len, self.num_longterm_mem_tokens
- return ceil(seq_len / segment_len) * num_mem + seq_len
+ return ((seq_len - 1) // segment_len) * num_mem + seq_len

  @torch.no_grad()
  def sample(
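A quick arithmetic check of the two rewritten helpers, using the segment_len = 32 and num_longterm_mem_tokens = 4 values that appear in the tests earlier in this diff (the helper names below are illustrative, not package code). Since ceil(n / k) equals (n - 1) // k + 1 for n >= 1, the old length formula always counted exactly one extra block of long-term memory tokens; the rewritten `seq_index_is_longterm` then simply asks whether a position's index within the attention window falls past the first `segment_len` real tokens, assuming `attn_window_size = segment_len + num_longterm_mem_tokens`:

```python
from math import ceil

segment_len, num_mem = 32, 4               # values used by the tests in this diff
attn_window_size = segment_len + num_mem   # assumption about how the window is sized

def old_len(seq_len):
    return ceil(seq_len / segment_len) * num_mem + seq_len

def new_len(seq_len):
    return ((seq_len - 1) // segment_len) * num_mem + seq_len

# the old formula over-counts by exactly one block of memory tokens at every length
for seq_len in (1, 32, 33, 64, 256):
    assert old_len(seq_len) == new_len(seq_len) + num_mem

def seq_index_is_longterm(seq_index):
    return ((seq_index % attn_window_size + 1) - segment_len) > 0

# within each 36-token window, indices 0..31 are real tokens, 32..35 are memory slots
assert not seq_index_is_longterm(31)
assert seq_index_is_longterm(32)
assert not seq_index_is_longterm(36)
```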
@@ -695,7 +698,7 @@ class MemoryAsContextTransformer(Module):
  mems = repeat(self.longterm_mems, 'n d -> b n d', b = x.shape[0])
  x, inverse_pack_mems = pack_with_inverse((x, mems), 'b * d')

- x = inverse_segment(x, remove_pad = False)
+ x = inverse_segment(x)

  # splice out unneeded tokens from padding for longterm mems

@@ -723,9 +726,9 @@ class MemoryAsContextTransformer(Module):
  is_inferencing = exists(cache)

  if not exists(cache):
- cache = (None, None)
+ cache = (seq_len_with_mem - 1, None, None)

- kv_caches, neural_mem_caches = cache
+ inference_seq_index, kv_caches, neural_mem_caches = cache

  kv_caches = iter(default(kv_caches, []))
  neural_mem_caches = iter(default(neural_mem_caches, []))
@@ -744,7 +747,8 @@ class MemoryAsContextTransformer(Module):
  # when inferencing, only do one token at a time

  if is_inferencing:
- x = x[:, -1:]
+ ind = inference_seq_index
+ x = x[:, ind:(ind + 1)]

  # expand and reduce streams for hyper connections

@@ -763,14 +767,13 @@ class MemoryAsContextTransformer(Module):
  mem_input, add_residual = mem_hyper_conn(x)

  if not is_inferencing:
- retrieved, mem_kv_aux_loss = mem(
+ (retrieved, next_neural_mem_cache), mem_kv_aux_loss = mem(
  mem_input,
  return_aux_kv_loss = True
  )

  kv_recon_losses = kv_recon_losses + mem_kv_aux_loss

- next_neural_mem_cache = (seq_len, None, None, None)
  else:
  retrieved, next_neural_mem_cache = mem.forward_inference(
  mem_input,
@@ -817,6 +820,17 @@ class MemoryAsContextTransformer(Module):
  if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
  next_kv_caches = next_kv_caches[..., 0:0, :]

+ next_cache = (
+ inference_seq_index + 1,
+ next_kv_caches,
+ next_neural_mem_caches
+ )
+
+ is_longterm_mem = self.seq_index_is_longterm(inference_seq_index)
+
+ if is_inferencing and is_longterm_mem:
+ return None, next_cache
+
  # hyper connection reducing of streams

  x = self.reduce_streams(x)
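The last few hunks thread an explicit sequence index through the inference cache: it starts at `seq_len_with_mem - 1`, advances by one per decoded position, selects the single token to process, and lets the forward return `None` logits when that position is a long-term memory slot. The reworked `test_mac_sampling` earlier in this diff exercises the whole path by comparing cached and uncached greedy sampling; a hedged reconstruction of that check (the import path and the length of `ids` are assumptions, everything else mirrors the test):

```python
import torch
from titans_pytorch import MemoryAsContextTransformer  # import path assumed

# configuration mirrors test_mac_sampling in this diff
transformer = MemoryAsContextTransformer(
    num_tokens = 256,
    dim = 256,
    depth = 4,
    segment_len = 32,
    num_persist_mem_tokens = 4,
    num_longterm_mem_tokens = 4,
    sliding_window_attn = False,
    neural_memory_layers = (),
    neural_mem_gate_attn_output = False
)

ids = torch.randint(0, 256, (1, 64))  # length chosen arbitrarily for this sketch

# greedy sampling must agree with and without the new index-carrying cache
sampled = transformer.sample(ids[:, :4], 53, use_cache = False, temperature = 0.)
sampled_with_cache = transformer.sample(ids[:, :4], 53, use_cache = True, temperature = 0.)

assert torch.allclose(sampled, sampled_with_cache)
```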
@@ -843,7 +857,7 @@ class MemoryAsContextTransformer(Module):
  if not return_cache:
  return logits

- return logits, (next_kv_caches, next_neural_mem_caches)
+ return logits, next_cache

  ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)

@@ -38,8 +38,11 @@ LinearNoBias = partial(Linear, bias = False)
  def exists(v):
  return v is not None

- def default(v, d):
- return v if exists(v) else d
+ def default(*args):
+ for arg in args:
+ if exists(arg):
+ return arg
+ return None

  def xnor(x, y):
  return not (x ^ y)
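`default` now accepts any number of fallbacks and returns the first non-None one, which is what the later `default(store_chunk_size, chunk_size, self.store_chunk_size)` call in this diff relies on. A quick illustration of the new behaviour:

```python
def exists(v):
    return v is not None

def default(*args):
    # return the first argument that is not None, otherwise None
    for arg in args:
        if exists(arg):
            return arg
    return None

assert default(None, None, 8) == 8   # falls through two missing values
assert default(4, 8) == 4            # first value still wins, as before
assert default(None, None) is None
```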
@@ -468,7 +471,12 @@ class NeuralMemory(Module):
  weighted_loss = loss * loss_weights
  return weighted_loss.sum(), weighted_loss.mean()

- self.per_sample_grad_fn = vmap(grad(forward_and_loss, has_aux = True), in_dims = (None, 0, 0, 0))
+ # two functions
+
+ grad_fn = grad(forward_and_loss, has_aux = True)
+
+ self.per_sample_grad_fn = vmap(grad_fn, in_dims = (None, 0, 0, 0))
+ self.per_sample_grad_fn_expanded_weights = vmap(grad_fn, in_dims = (0,) * 4)

  # queries for retrieving from the model

@@ -561,6 +569,7 @@ class NeuralMemory(Module):
  seq,
  weights: dict[str, Tensor],
  past_state: tuple[dict[str, Tensor], dict[str, Tensor]] | None = None,
+ prev_layer_updates: dict[str, Tensor] | None = None,
  return_aux_kv_loss = False,
  chunk_size = None,
  value_residual = None
@@ -583,10 +592,25 @@ class NeuralMemory(Module):

  seq = seq[:, :round_down_seq_len]

+ # per sample grad function
+
+ per_sample_grad_fn = self.per_sample_grad_fn
+
  # weights of the memory network

  weights = TensorDict(weights)

+ # allow for neural memory of a previous layer and the past to produce gradients that become the weights of the current one generating the surprise
+ # think this is necessary otherwise the memory model is static (unless if paper is misunderstood)
+ # improvise (or perhaps correcting to) a solution
+
+ if exists(prev_layer_updates):
+ prev_layer_updates = TensorDict(weights)
+
+ weights = weights + prev_layer_updates
+
+ per_sample_grad_fn = self.per_sample_grad_fn_expanded_weights # the weights will now have a batch * chunk dimension
+
  # derive learned hparams for optimization of memory network

  adaptive_lr = self.to_adaptive_step(seq)
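The hunk above is where the expanded-weights variant gets used: when `prev_layer_updates` is supplied, the memory weights gain a batch dimension and `per_sample_grad_fn_expanded_weights` (defined two hunks earlier with `in_dims = (0,) * 4`) is selected instead of the broadcast version. An illustrative sketch of that `in_dims` distinction, with a simplified toy loss rather than the package's `forward_and_loss`:

```python
import torch
from torch.func import grad, vmap

def loss(weight, x):
    # toy per-sample loss over a single linear "memory" weight
    return ((x @ weight) ** 2).sum()

grad_fn = grad(loss)

x = torch.randn(8, 4)                 # batch of 8 samples
shared_w = torch.randn(4, 4)          # one weight matrix shared across the batch
per_sample_w = torch.randn(8, 4, 4)   # a separate weight matrix per sample

# in_dims = (None, 0): the same weights are broadcast to every sample
shared_grads = vmap(grad_fn, in_dims = (None, 0))(shared_w, x)

# in_dims = (0, 0): the weights carry their own leading batch dimension
expanded_grads = vmap(grad_fn, in_dims = (0, 0))(per_sample_w, x)

assert shared_grads.shape == expanded_grads.shape == (8, 4, 4)
```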
@@ -635,7 +659,7 @@ class NeuralMemory(Module):

  # get grads and extra auxiliary loss (for backwarding through qkv projection in base neural memory module)

- grads, aux_kv_recon_loss = self.per_sample_grad_fn(dict(weights), keys, adaptive_lr, values)
+ grads, aux_kv_recon_loss = per_sample_grad_fn(dict(weights), keys, adaptive_lr, values)

  grads = TensorDict(grads)

@@ -781,6 +805,7 @@ class NeuralMemory(Module):

  return values[:, :seq_len]

+ @torch.no_grad()
  def forward_inference(
  self,
  token: Tensor,
@@ -854,13 +879,18 @@ class NeuralMemory(Module):
  return_aux_kv_loss = False,
  chunk_size = None,
  store_chunk_size = None,
- return_values = False
+ return_values = False,
+ return_next_state = False
  ):
  batch, seq_len = seq.shape[:2]

  if seq_len < self.retrieve_chunk_size:
  out = self.init_empty_memory_embed(batch, seq_len)

+ next_store_state = (seq_len, seq, None, None)
+
+ out = (out, next_store_state)
+
  if not return_aux_kv_loss:
  return out

@@ -870,16 +900,31 @@ class NeuralMemory(Module):
  mem_model_weights = self.init_weights()

  store_seq = default(store_seq, seq)
- store_chunk_size = default(store_chunk_size, chunk_size)
+
+ store_seq_len = store_seq.shape[-2]
+ store_chunk_size = default(store_chunk_size, chunk_size, self.store_chunk_size)
+ remainder = store_seq_len % store_chunk_size

  (updates, next_state, values), aux_kv_recon_loss = self.store_memories(store_seq, mem_model_weights, chunk_size = store_chunk_size, return_aux_kv_loss = True)

  retrieved = self.retrieve_memories(seq, mem_model_weights + updates, chunk_size = chunk_size)

- output = retrieved
+ # determine state for the storing of memories
+ # for transformer-xl like training with neural memory as well as inferencing with initial prompt
+
+ cache_store_seq = None
+
+ if remainder > 0:
+ cache_store_seq = store_seq[:, -remainder:]
+
+ updates = updates.apply(lambda t: t[:, -1:])
+
+ next_store_state = (seq_len, cache_store_seq, next_state, updates)
+
+ output = (retrieved, next_store_state)

  if return_values:
- output = (retrieved, values)
+ output = (*output, values)

  if not return_aux_kv_loss:
  return output
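Taken together, the forward hunks mean `NeuralMemory.forward` now always returns the next store state alongside the retrieved sequence, with the auxiliary KV-reconstruction loss, when requested, wrapping that pair one level deeper, exactly as the MemoryAsContextTransformer hunk earlier in this diff unpacks it. A short sketch of the resulting return structure (constructor arguments as in the tests):

```python
import torch
from titans_pytorch import NeuralMemory

mem = NeuralMemory(dim = 384, chunk_size = 64)
seq = torch.randn(2, 1024, 384)

# plain call: retrieved sequence plus the state needed to keep storing,
# where the state is (seq_len, cache_store_seq, past_state, updates) per the hunk above
retrieved, mem_state = mem(seq)
assert retrieved.shape == seq.shape

# with the auxiliary loss, the (retrieved, state) pair is nested one level,
# matching `(retrieved, cache), aux_loss = mem(..., return_aux_kv_loss = True)`
# as unpacked in the MAC transformer hunk earlier in this diff
(retrieved, mem_state), aux_kv_loss = mem(seq, return_aux_kv_loss = True)
```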
File without changes