titans-pytorch 0.0.58.tar.gz → 0.0.61.tar.gz

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.0.58
+Version: 0.0.61
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -1,6 +1,6 @@
 [project]
 name = "titans-pytorch"
-version = "0.0.58"
+version = "0.0.61"
 description = "Titans"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -24,8 +24,8 @@ def create_mac_block_mask(seq_len, window_size, persist_mem_len):

     def create_mac_mask(b, h, q_idx, kv_idx):
         is_persist_mem = kv_idx < persist_mem_len
-        causal_mask = q_idx >= (kv_idx - is_persist_mem)
-        block_diagonal = (q_idx // window_size) == ((kv_idx - is_persist_mem) // window_size)
+        causal_mask = q_idx >= (kv_idx - persist_mem_len)
+        block_diagonal = (q_idx // window_size) == ((kv_idx - persist_mem_len) // window_size)
         return is_persist_mem | (~is_persist_mem & (causal_mask & block_diagonal))

     block_mask = create_block_mask(create_mac_mask, B = None, H = None, Q_LEN = seq_len, KV_LEN = seq_len + persist_mem_len, _compile = True)
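This hunk fixes the mask arithmetic: the old code subtracted `is_persist_mem`, a boolean that is only ever 0 or 1, where the intent was to shift key positions back by the full `persist_mem_len` so that the first real key lines up with query position 0. A standalone check of the corrected semantics (plain PyTorch, toy sizes of my choosing, no flex attention needed):

```python
import torch

seq_len, window_size, persist_mem_len = 8, 4, 2

q_idx = torch.arange(seq_len).view(-1, 1)                     # query positions (real tokens only)
kv_idx = torch.arange(seq_len + persist_mem_len).view(1, -1)  # key positions, persistent mem tokens first

is_persist_mem = kv_idx < persist_mem_len

# fixed logic: shift kv back by persist_mem_len so key position persist_mem_len
# aligns with query position 0
causal_mask = q_idx >= (kv_idx - persist_mem_len)
block_diagonal = (q_idx // window_size) == ((kv_idx - persist_mem_len) // window_size)

mask = is_persist_mem | (~is_persist_mem & (causal_mask & block_diagonal))
print(mask.int())  # first persist_mem_len columns all ones; the rest is causal within each window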
@@ -489,7 +489,7 @@ class MemoryAsContextTransformer(Module):
         flex_attn_fn = None

         if use_flex_attn:
-            block_mask = create_mac_block_mask(seq_len_with_mem, self.segment_len, self.num_persist_mem_tokens)
+            block_mask = create_mac_block_mask(seq_len_with_mem, segment_len + num_longterm_mem_tokens, self.num_persist_mem_tokens)
             flex_attn_fn = partial(flex_attention, block_mask = block_mask)

         # value residual
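Context for this change: the window the block mask describes evidently has to cover not just the segment but also the long-term memory tokens packed into it, so the window size passed to `create_mac_block_mask` becomes `segment_len + num_longterm_mem_tokens` rather than `self.segment_len` alone. A toy calculation (values of my choosing):

```python
segment_len, num_longterm_mem_tokens = 32, 4
effective_window = segment_len + num_longterm_mem_tokens
print(effective_window)  # 36 - the window size the block mask actually needs
```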
@@ -289,6 +289,8 @@ class NeuralMemory(Module):

         self.use_accelerated_scan = use_accelerated_scan

+        self.register_buffer('zero', torch.tensor(0.), persistent = False)
+
     def init_weights_and_momentum(self):
         params = TensorDict(dict(self.memory_model.named_parameters()))

@@ -306,6 +308,13 @@ class NeuralMemory(Module):
         past_state: tuple[dict[str, Tensor], dict[str, Tensor]],
         return_aux_kv_loss = False
     ):
+        seq_len = seq.shape[-2]
+
+        # handle edge case
+
+        if seq_len < self.chunk_size:
+            past_weight, _ = past_state
+            return TensorDict(past_weight).clone().zero_(), self.zero

         seq = self.store_norm(seq)

@@ -425,12 +434,10 @@ class NeuralMemory(Module):

         last_update = updates.apply(lambda t: t[:, -1])

-        next_state = (curr_weights + last_update, next_momentum)
-
         if not return_aux_kv_loss:
-            return updates, next_state
+            return updates

-        return updates, next_state, aux_kv_recon_loss.mean()
+        return updates, aux_kv_recon_loss.mean()

     def retrieve_memories(
         self,
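`store_memories` no longer assembles `next_state` itself: it returns just the per-chunk updates (plus the mean reconstruction loss when requested), leaving the caller to fold the last update into the current weights. A hedged sketch of that fold, mirroring the `last_update` line visible above (assumes a `tensordict` version with pointwise arithmetic; shapes are illustrative):

```python
import torch
from tensordict import TensorDict

curr_weights = TensorDict({'w': torch.zeros(4)}, batch_size = [])
updates = TensorDict({'w': torch.randn(2, 3, 4)}, batch_size = [])  # (batch, num chunks, *param shape)

last_update = updates.apply(lambda t: t[:, -1])  # keep only the final chunk's update
next_weights = curr_weights + last_update        # what next_state used to carry before this change
```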
@@ -442,7 +449,8 @@ class NeuralMemory(Module):

         seq = self.retrieve_norm(seq)

-        assert seq_len >= chunk_size
+        if seq_len < self.chunk_size:
+            return self.init_empty_memory_embed(batch, seq_len)

         seq = seq[:, (chunk_size - 1):]
         curtailed_seq_len = seq.shape[-2]
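Retrieval gets the matching edge-case treatment: rather than asserting, sequences shorter than one chunk fall back to `init_empty_memory_embed`. A standalone sketch of the guard, where a zero placeholder stands in for whatever `init_empty_memory_embed` actually produces (`chunk_size` and `dim` are toy values):

```python
import torch

def retrieve_or_empty(seq, chunk_size, dim):
    batch, seq_len = seq.shape[0], seq.shape[-2]
    if seq_len < chunk_size:
        # placeholder with the right shape, in lieu of init_empty_memory_embed
        return torch.zeros(batch, seq_len, dim)
    raise NotImplementedError('normal retrieval path elided in this sketch')

out = retrieve_or_empty(torch.randn(2, 3, 16), chunk_size = 4, dim = 16)
print(out.shape)  # torch.Size([2, 3, 16])
```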
@@ -524,7 +532,7 @@ class NeuralMemory(Module):

         store_seq = default(store_seq, seq)

-        updates, next_memories, aux_kv_recon_loss = self.store_memories(store_seq, past_state, return_aux_kv_loss = True)
+        updates, aux_kv_recon_loss = self.store_memories(store_seq, past_state, return_aux_kv_loss = True)

         past_weights, _ = past_state

@@ -24,16 +24,27 @@ GENERATE_LENGTH = 512
 SHOULD_GENERATE = True
 SEQ_LEN = 512

-PROJECT_NAME = 'titans-mac-transformer'
-WANDB_ONLINE = False # turn this on to pipe experiment to cloud
+# neural memory related
+
 NEURAL_MEMORY_DEPTH = 2
 NUM_PERSIST_MEM = 4
 NUM_LONGTERM_MEM = 4
 NEURAL_MEM_LAYERS = (2, 4)
+NEURAL_MEM_GATE_ATTN_OUTPUT = True
 WINDOW_SIZE = 32
 KV_RECON_LOSS_WEIGHT = 0.
 LEARNED_MEM_MODEL_WEIGHTS = True
+
+# experiment related
+
+PROJECT_NAME = 'titans-mac-transformer'
 RUN_NAME = f'mac - {NUM_LONGTERM_MEM} longterm mems, layers {NEURAL_MEM_LAYERS}'
+WANDB_ONLINE = False # turn this on to pipe experiment to cloud
+
+# perf related
+
+USE_ACCELERATED_SCAN = True
+USE_FLEX_ATTN = True

 # wandb experiment tracker

@@ -112,10 +123,13 @@ model = MemoryAsContextTransformer(
     num_longterm_mem_tokens = NUM_LONGTERM_MEM,
     neural_memory_layers = NEURAL_MEM_LAYERS,
     neural_memory_segment_len = WINDOW_SIZE // 2,
+    neural_mem_gate_attn_output = NEURAL_MEM_GATE_ATTN_OUTPUT,
     aux_kv_recon_loss_weight = KV_RECON_LOSS_WEIGHT,
+    use_flex_attn = USE_FLEX_ATTN,
     neural_memory_kwargs = dict(
         dim_head = 64,
         heads = 4,
+        use_accelerated_scan = USE_ACCELERATED_SCAN,
         learned_mem_model_weights = LEARNED_MEM_MODEL_WEIGHTS,
         default_model_kwargs = dict(
             depth = NEURAL_MEMORY_DEPTH,
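For completeness, a hedged construction sketch showing where the newly threaded flags land. Only `neural_mem_gate_attn_output`, `use_flex_attn`, and `use_accelerated_scan` are confirmed by this diff; the remaining constructor arguments are assumed from the project README and may differ at this exact version:

```python
import torch
from titans_pytorch import MemoryAsContextTransformer

model = MemoryAsContextTransformer(
    num_tokens = 256,                    # assumed args, per the README
    dim = 256,
    depth = 2,
    segment_len = 32,
    num_persist_mem_tokens = 4,
    num_longterm_mem_tokens = 4,
    neural_mem_gate_attn_output = True,  # new in this diff
    use_flex_attn = True,                # new in this diff: flex attention + the block mask above
    neural_memory_kwargs = dict(
        use_accelerated_scan = True      # new in this diff: accelerated scan in the neural memory
    )
)
```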