titans-pytorch 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff shows the changes between these two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
titans_pytorch/mac_transformer.py CHANGED
@@ -90,6 +90,9 @@ def divisible_by(num, den):
 def round_up_multiple(seq, mult):
     return ceil(seq / mult) * mult
 
+def round_down_multiple(seq, mult):
+    return seq // mult * mult
+
 def pack_with_inverse(t, pattern):
     packed, packed_shape = pack(t, pattern)
 
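The new round_down_multiple helper mirrors the existing round_up_multiple. A tiny self-contained sketch of the pair (the numeric values are illustrative only, not taken from the package):

from math import ceil

def round_up_multiple(seq, mult):
    # smallest multiple of mult that is >= seq
    return ceil(seq / mult) * mult

def round_down_multiple(seq, mult):
    # largest multiple of mult that is <= seq
    return seq // mult * mult

assert round_up_multiple(300, 128) == 384
assert round_down_multiple(300, 128) == 256
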
@@ -116,11 +119,11 @@ def pad_and_segment_with_inverse(seq, segment_len, fold_into_batch = True):
     if fold_into_batch:
         seq = rearrange(seq, 'b (w n) d -> (b w) n d', n = segment_len)
 
-    def inverse(out):
+    def inverse(out, remove_pad = True):
         if fold_into_batch:
             out = rearrange(out, '(b w) ... n d -> b ... (w n) d', b = batch)
 
-        if needs_pad:
+        if needs_pad and remove_pad:
             out = out[..., :-padding, :]
 
         return out
@@ -312,7 +315,7 @@ class SegmentedAttention(Module):
 
         # caching
 
-        next_cache = tuple(map(inverse_segment, (k, v)))
+        next_cache = (k, v)
 
         # take care of persistent memory key / values
 
@@ -575,6 +578,27 @@ class MemoryAsContextTransformer(Module):
 
         self.num_persist_mem_tokens = num_persist_mem_tokens
 
+    def seq_index_is_longterm(
+        self,
+        seq_index
+    ):
+        total_segment_len = self.attn_window_size
+
+        seq = seq_index + 1
+        seq -= int((seq % total_segment_len) == 0)
+        last_segment_len = round_down_multiple(seq, total_segment_len)
+        segment_seq = seq - last_segment_len
+        return (segment_seq - self.segment_len) > 0
+
+    def seq_len_with_longterm_mem(
+        self,
+        seq_len
+    ):
+        assert seq_len > 0
+
+        segment_len, num_mem = self.segment_len, self.num_longterm_mem_tokens
+        return ceil(seq_len / segment_len) * num_mem + seq_len
+
     @torch.no_grad()
     def sample(
         self,
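
A self-contained sketch of the arithmetic behind the new seq_len_with_longterm_mem method: every (possibly partial) segment of the input picks up one block of num_longterm_mem_tokens interspersed memory tokens, so the effective sequence length grows accordingly. The hyperparameter values below are illustrative, not the package defaults:

from math import ceil

def seq_len_with_longterm_mem(seq_len, segment_len, num_longterm_mem_tokens):
    # each (possibly partial) segment contributes one block of memory tokens
    assert seq_len > 0
    return ceil(seq_len / segment_len) * num_longterm_mem_tokens + seq_len

# e.g. segment_len = 128 with 4 memory tokens per segment:
# a 300 token prompt spans 3 segments and picks up 12 memory positions
assert seq_len_with_longterm_mem(300, 128, 4) == 312
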
@@ -594,8 +618,6 @@ class MemoryAsContextTransformer(Module):
         prompt_seq_len, out = prompt.shape[-1], prompt.clone()
         sample_num_times = max(0, seq_len - prompt_seq_len)
 
-        iter_wrap = tqdm.tqdm if show_progress else identity
-
         # cache for axial pos, attention, and neural memory
 
         cache = None
@@ -604,9 +626,7 @@ class MemoryAsContextTransformer(Module):
         # precompute factorized pos emb
 
         if use_cache:
-            round_up_seq_len = round_up_multiple(seq_len, self.segment_len)
-            longterm_mem_lens = (round_up_seq_len // self.segment_len) * self.num_longterm_mem_tokens
-            seq_len_with_mem = round_up_seq_len + longterm_mem_lens
+            seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)
 
             axial_dims = self.axial_pos_emb.maybe_derive_outer_dim(seq_len_with_mem, (self.neural_memory_segment_len,))
 
@@ -614,25 +634,31 @@ class MemoryAsContextTransformer(Module):
 
         # sample
 
-        for _ in iter_wrap(range(sample_num_times)):
+        with tqdm.tqdm(total = sample_num_times, disable = not show_progress) as pbar:
 
-            logits, next_cache = self.forward(
-                out,
-                disable_flex_attn = True,
-                cache = cache,
-                return_cache = True,
-                factorized_pos_emb = factorized_pos_emb
-            )
+            while out.shape[-1] < seq_len:
+
+                logits, next_cache = self.forward(
+                    out,
+                    disable_flex_attn = True,
+                    cache = cache,
+                    return_cache = True,
+                    factorized_pos_emb = factorized_pos_emb
+                )
 
-            if use_cache:
-                cache = next_cache
+                if use_cache:
+                    cache = next_cache
 
-            logits = logits[:, -1]
+                if not exists(logits):
+                    continue
 
-            logits = filter_fn(logits, **filter_kwargs)
-            sample = gumbel_sample(logits, temperature = temperature)
+                logits = logits[:, -1]
 
-            out = torch.cat((out, sample), dim = -1)
+                logits = filter_fn(logits, **filter_kwargs)
+                sample = gumbel_sample(logits, temperature = temperature)
+
+                out = torch.cat((out, sample), dim = -1)
+                pbar.update(1)
 
         self.train(was_training)
 
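Sampling is now driven by a while loop over the output length rather than a fixed iteration count, since during cached inference some forward calls only consume interspersed long-term memory positions and return no logits. A rough sketch of that control flow, assuming a hypothetical step callback that stands in for the model's forward pass:

import tqdm

def sample_until(seq_len, prompt, step, show_progress = False):
    # step(out) is a stand-in for the model call: it returns None when the
    # current position is a long-term memory token, otherwise the sampled id
    out = list(prompt)

    with tqdm.tqdm(total = seq_len - len(prompt), disable = not show_progress) as pbar:
        while len(out) < seq_len:
            token = step(out)

            if token is None:
                continue        # memory position, nothing was sampled

            out.append(token)
            pbar.update(1)      # progress only advances on real tokens

    return out
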
@@ -656,6 +682,8 @@ class MemoryAsContextTransformer(Module):
 
         batch, seq_len, neural_mem_segment_len, segment_len, num_longterm_mem_tokens, attn_window_size = *x.shape, self.neural_memory_segment_len, self.segment_len, self.num_longterm_mem_tokens, self.attn_window_size
 
+        seq_len_with_mem = self.seq_len_with_longterm_mem(seq_len)
+
         # token embedding
 
         x = self.token_emb(x)
@@ -667,9 +695,11 @@ class MemoryAsContextTransformer(Module):
         mems = repeat(self.longterm_mems, 'n d -> b n d', b = x.shape[0])
         x, inverse_pack_mems = pack_with_inverse((x, mems), 'b * d')
 
-        x = inverse_segment(x)
+        x = inverse_segment(x, remove_pad = False)
+
+        # splice out unneeded tokens from padding for longterm mems
 
-        seq_len_with_mem = x.shape[-2]
+        x = x[:, :seq_len_with_mem]
 
         # apply axial positional embedding
         # so intra and inter segment can be more easily discerned by the network
@@ -685,13 +715,12 @@ class MemoryAsContextTransformer(Module):
 
         flex_attn_fn = None
 
         if use_flex_attn:
-            block_mask = create_mac_block_mask(seq_len_with_mem, segment_len + num_longterm_mem_tokens, self.num_persist_mem_tokens, self.sliding_window_attn)
+            block_mask = create_mac_block_mask(seq_len_with_mem, self.attn_window_size, self.num_persist_mem_tokens, self.sliding_window_attn)
             flex_attn_fn = partial(flex_attention, block_mask = block_mask)
 
         # kv caching
 
         is_inferencing = exists(cache)
-        assert not (is_inferencing and self.num_longterm_mem_tokens > 0)
 
         if not exists(cache):
             cache = (None, None)
@@ -741,10 +770,10 @@ class MemoryAsContextTransformer(Module):
 
                     kv_recon_losses = kv_recon_losses + mem_kv_aux_loss
 
+                    next_neural_mem_cache = (seq_len, None, None, None)
                 else:
                     retrieved, next_neural_mem_cache = mem.forward_inference(
                         mem_input,
-                        seq_index = seq_len - 1,
                         state = next(neural_mem_caches, None)
                     )
 
@@ -775,15 +804,34 @@ class MemoryAsContextTransformer(Module):
 
             x = ff(x)
 
+        # taking care of cache first
+        # for early return when processing long term mem tokens during inference
+
+        if return_cache:
+            next_kv_caches = stack([stack(kv_cache) for kv_cache in next_kv_caches])
+
+            # handle kv cache length depending on local attention type
+
+            next_kv_caches = next_kv_caches[..., -attn_window_size:, :]
+
+            if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
+                next_kv_caches = next_kv_caches[..., 0:0, :]
+
+        # hyper connection reducing of streams
+
         x = self.reduce_streams(x)
 
         # excise out the memories
 
-        x, inverse_segment = pad_and_segment_with_inverse(x, segment_len + num_longterm_mem_tokens)
+        if not is_inferencing:
+
+            x, inverse_segment = pad_and_segment_with_inverse(x, attn_window_size)
 
-        x, _ = inverse_pack_mems(x)
+            x, _ = inverse_pack_mems(x)
 
-        x = inverse_segment(x)
+            x = inverse_segment(x)
+
+            x = x[:, :seq_len]
 
         # to logits
 
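The cache preparation now happens before reduce_streams so inference can return early while long-term memory tokens are being processed. A minimal sketch of the trimming logic itself, applied to a dummy tensor whose shapes are purely illustrative:

import torch

attn_window_size = 8
sliding_window_attn = False

# dummy (batch, heads, seq, dim) key/value cache
next_kv_cache = torch.randn(1, 4, 19, 16)
seq_len_with_mem = next_kv_cache.shape[-2]

# keep at most one attention window of cached positions
next_kv_cache = next_kv_cache[..., -attn_window_size:, :]

# with windowed (non-sliding) attention, a freshly completed window means the
# next token starts a new window and cannot attend to the cached one
if not sliding_window_attn and (seq_len_with_mem % attn_window_size) == 0:
    next_kv_cache = next_kv_cache[..., 0:0, :]

print(next_kv_cache.shape)  # torch.Size([1, 4, 8, 16]) here, since 19 % 8 != 0
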
@@ -795,15 +843,6 @@ class MemoryAsContextTransformer(Module):
             if not return_cache:
                 return logits
 
-            next_kv_caches = stack([stack(kv_cache) for kv_cache in next_kv_caches])
-
-            # handle kv cache length depending on local attention type
-
-            next_kv_caches = next_kv_caches[..., -attn_window_size:, :]
-
-            if not self.sliding_window_attn and divisible_by(seq_len_with_mem, attn_window_size):
-                next_kv_caches = next_kv_caches[..., 0:0, :]
-
             return logits, (next_kv_caches, next_neural_mem_caches)
 
         ar_loss = F.cross_entropy(rearrange(logits, 'b n l -> b l n'), labels)
titans_pytorch/titans.py CHANGED
@@ -783,18 +783,16 @@ class NeuralMemory(Module):
     def forward_inference(
         self,
         token: Tensor,
-        seq_index = None, # the index of the token in the sequence, starts at 0
         state = None,
     ):
 
         # unpack previous state
 
         if not exists(state):
-            state = (None, None, None)
+            state = (0, None, None, None)
 
-        cache_store_seq, past_states, updates = state
+        seq_index, cache_store_seq, past_states, updates = state
 
-        seq_index = default(seq_index, 0)
         curr_seq_len = seq_index + 1
         batch = token.shape[0]
 
@@ -814,7 +812,7 @@ class NeuralMemory(Module):
         if curr_seq_len < self.chunk_size:
             empty_mem = self.init_empty_memory_embed(batch, 1)
 
-            return empty_mem, (cache_store_seq, past_states, updates)
+            return empty_mem, (curr_seq_len, cache_store_seq, past_states, updates)
 
         # store if storage sequence cache hits the chunk size
 
@@ -842,7 +840,7 @@
 
         # next state tuple
 
-        next_state = (cache_store_seq, next_states, updates)
+        next_state = (curr_seq_len, cache_store_seq, next_states, updates)
 
         return retrieved, next_state
 
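With this change, NeuralMemory.forward_inference carries the sequence index inside its state tuple instead of receiving it as an argument, so a caller only threads the opaque state between calls. A toy stand-in that mimics just that threading contract (nothing here is the package's real memory logic):

class ToyMemory:
    # state layout after this change: (seq_index, cache_store_seq, past_states, updates)

    def forward_inference(self, token, state = None):
        if state is None:
            state = (0, None, None, None)

        seq_index, cache_store_seq, past_states, updates = state
        curr_seq_len = seq_index + 1

        retrieved = f'memory read at step {curr_seq_len}'
        return retrieved, (curr_seq_len, cache_store_seq, past_states, updates)

mem, state = ToyMemory(), None

for token in ['a', 'b', 'c']:
    retrieved, state = mem.forward_inference(token, state = state)

print(state[0])  # 3 -- the sequence index now advances inside the state itself
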
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: titans-pytorch
-Version: 0.1.26
+Version: 0.1.28
 Summary: Titans
 Project-URL: Homepage, https://pypi.org/project/titans-pytorch/
 Project-URL: Repository, https://github.com/lucidrains/titans-pytorch
@@ -35,7 +35,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Requires-Dist: accelerated-scan>=0.2.0
-Requires-Dist: axial-positional-embedding>=0.3.9
+Requires-Dist: axial-positional-embedding>=0.3.10
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
 Requires-Dist: hyper-connections>=0.1.9
@@ -0,0 +1,8 @@
+titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
+titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
+titans_pytorch/mac_transformer.py,sha256=RRLdVa8z-2IWbhhmRGfoNBycwaL32aMbpqutzmSQqpc,24575
+titans_pytorch/titans.py,sha256=gjoDcTsvw5X2d1I2xq4cM45YJIBqtLFuws8_jVylW_4,25746
+titans_pytorch-0.1.28.dist-info/METADATA,sha256=8AJX9oaut11GeFcyBmVsmbnY7oWhsal13yv75DtPeno,6815
+titans_pytorch-0.1.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+titans_pytorch-0.1.28.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+titans_pytorch-0.1.28.dist-info/RECORD,,
@@ -1,8 +0,0 @@
-titans_pytorch/__init__.py,sha256=u0tta_KqhOdfzCEDWT9P4_jejJEK2q1XxhsEzB5MnQU,223
-titans_pytorch/associative_scan.py,sha256=Y-iYqmFuG-NoCKu6kgql1mhowXTeJfyawi3eUIXamp0,2650
-titans_pytorch/mac_transformer.py,sha256=RkEGmVlQyK1opqylqt1VEFEc_Gd_pbAArcwfhphotXI,23564
-titans_pytorch/titans.py,sha256=a-BXTG6DdNXWhby6E4W2fdhwipuMQ12tSqSL10iLvfY,25826
-titans_pytorch-0.1.26.dist-info/METADATA,sha256=zogTDD7iLlxkPDzIeCap9GCgz2VNFUWjVF_K6K8H9kg,6814
-titans_pytorch-0.1.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-titans_pytorch-0.1.26.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-titans_pytorch-0.1.26.dist-info/RECORD,,