locoformer 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
locoformer/locoformer.py CHANGED
@@ -2,11 +2,12 @@ from __future__ import annotations
  from functools import partial

  import torch
- from torch import cat, stack, is_tensor
+ from torch import nn, cat, stack, arange, Tensor, is_tensor
  import torch.nn.functional as F
- from torch.nn import Module, ModuleList, Linear, RMSNorm, Identity
+ from torch.nn import Module, ModuleList, Linear, RMSNorm, Identity, Sequential
  from torch.utils._pytree import tree_map

+ import einx
  from einops import rearrange, einsum
  from einops.layers.torch import Rearrange

@@ -24,6 +25,9 @@ def exists(v):
  def default(v, d):
      return v if exists(v) else d

+ def first(arr):
+     return arr[0]
+
  def divisible_by(num, den):
      return (num % den) == 0

@@ -148,9 +152,12 @@ class Attention(Module):
      def __init__(
          self,
          dim,
+         window_size,
          dim_head = 64,
          heads = 8,
-         pre_rmsnorm = True
+         pre_rmsnorm = True,
+         fixed_window_size = False,
+         accept_value_residual = False
      ):
          super().__init__()
          self.scale = dim_head ** -0.5
@@ -167,20 +174,55 @@ class Attention(Module):
          self.to_kv = LinearNoBias(dim, dim_inner * 2)
          self.to_out = LinearNoBias(dim_inner, dim)

+         self.to_v_gates = Sequential(
+             LinearNoBias(dim, heads),
+             Rearrange('b n h -> b h n 1'),
+             nn.Sigmoid()
+         )
+
+         # value residual
+
+         self.accept_value_residual = accept_value_residual
+
+         if accept_value_residual:
+             self.to_value_residual_mix = Sequential(
+                 LinearNoBias(dim, heads),
+                 Rearrange('b n h -> b h n 1'),
+                 nn.Sigmoid()
+             )
+
+         # fixed window size
+
+         self.fixed_window_size = fixed_window_size
+         self.window_size = window_size
+
      def forward(
          self,
          tokens,
+         value_residual = None,
          kv_cache = None,
-         return_kv_cache = False
+         return_kv_cache = False,
      ):
+         seq_len = tokens.shape[-2]
+         assert seq_len <= self.window_size
+
+         device = tokens.device
+
          tokens = self.norm(tokens)

          q, k, v = (self.to_q(tokens), *self.to_kv(tokens).chunk(2, dim = -1))

          q, k, v = map(self.split_heads, (q, k, v))

+         orig_v = v
+
          q = q * self.scale

+         if exists(value_residual):
+             assert self.accept_value_residual
+             mix = self.to_value_residual_mix(tokens)
+             v = v.lerp(value_residual, mix)
+
          if exists(kv_cache):
              ck, cv = kv_cache
              k = cat((ck, k), dim = -2)
@@ -195,7 +237,13 @@ class Attention(Module):

          i, j = sim.shape[-2:]

-         causal_mask = torch.ones((i, j), dtype = torch.bool, device = sim.device).triu(j - i + 1)
+         if self.fixed_window_size:
+             i_seq = arange(i, device = device)
+             j_seq = arange(j, device = device) - (j - i)
+             dist = einx.subtract('i, j -> i j', i_seq, j_seq)
+             causal_mask = (dist < 0) | (dist > self.window_size)
+         else:
+             causal_mask = torch.ones((i, j), dtype = torch.bool, device = sim.device).triu(j - i + 1)

          sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
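Aside (not part of the package diff): the `einx.subtract` expression above computes, for each query/key pair, how many steps the key lies behind the query; keys in the future or further back than `window_size` are masked. A minimal plain-`torch` sketch of an equivalent mask, with the helper name `sliding_window_causal_mask` being my own for illustration:

```python
import torch

def sliding_window_causal_mask(i, j, window_size, device = None):
    # queries occupy the last i positions of a length-j key sequence (kv cache prepended)
    q_pos = torch.arange(i, device = device) + (j - i)
    k_pos = torch.arange(j, device = device)

    dist = q_pos[:, None] - k_pos[None, :]  # how far each key lies behind each query

    # True entries are masked out: future keys, or keys beyond the fixed window
    return (dist < 0) | (dist > window_size)

mask = sliding_window_causal_mask(4, 8, window_size = 3)
```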
 
@@ -203,6 +251,8 @@ class Attention(Module):

          out = einsum(attn, v, 'b h i j, b h j d -> b h i d')

+         out = out * self.to_v_gates(tokens)
+
          out = self.merge_heads(out)

          out = self.to_out(out)
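Aside (not part of the package diff): the new `to_v_gates` branch gates the attention output per head and position with a sigmoid before heads are merged. A small shape sketch under assumed dimensions:

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

dim, heads, dim_head = 512, 8, 64

to_v_gates = nn.Sequential(
    nn.Linear(dim, heads, bias = False),   # one gate logit per head
    Rearrange('b n h -> b h n 1'),         # -> (batch, heads, seq, 1)
    nn.Sigmoid()
)

tokens = torch.randn(2, 16, dim)           # (batch, seq, dim)
out = torch.randn(2, heads, 16, dim_head)  # per-head attention output

gated = out * to_v_gates(tokens)           # gate broadcasts over the feature dimension
```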
@@ -210,7 +260,7 @@ class Attention(Module):
          if not return_kv_cache:
              return out

-         return out, next_kv_cache
+         return out, (next_kv_cache, orig_v)

  class FeedForward(Module):
      def __init__(
@@ -244,17 +294,21 @@ class TransformerXL(Module):
          self,
          dim,
          depth,
+         window_size,
          dim_head = 64,
          heads = 8,
          expansion_factor = 4.,
-         final_norm = True
+         final_norm = True,
+         fixed_window_size = False,
      ):
          super().__init__()

          layers = ModuleList([])

-         for _ in range(depth):
-             attn = Attention(dim = dim, dim_head = dim_head, heads = heads)
+         for i in range(depth):
+             is_first = i == 0
+
+             attn = Attention(dim = dim, dim_head = dim_head, heads = heads, fixed_window_size = fixed_window_size, window_size = window_size, accept_value_residual = not is_first)

              ff = FeedForward(dim = dim, expansion_factor = expansion_factor)

@@ -265,6 +319,11 @@ class TransformerXL(Module):
          self.layers = layers
          self.norm = RMSNorm(dim) if final_norm else Identity()

+         # fixed window size
+
+         self.fixed_window_size = fixed_window_size
+         self.window_size = window_size
+
      def forward(
          self,
          x,
@@ -275,22 +334,28 @@ class TransformerXL(Module):
          cache = default(cache, (None,) * len(self.layers))

          next_kv_caches = []
+         value_residual = None

          for (attn, ff), kv_cache in zip(self.layers, cache):

-             attn_out, next_kv_cache = attn(x, kv_cache = kv_cache, return_kv_cache = True)
-
-             next_kv_caches.append(next_kv_cache)
+             attn_out, (next_kv_cache, values) = attn(x, value_residual = value_residual, kv_cache = kv_cache, return_kv_cache = True)

              x = attn_out + x
              x = ff(x) + x

+             next_kv_caches.append(next_kv_cache)
+             value_residual = default(value_residual, values)
+
          embed = self.norm(x)

          if not return_kv_cache:
              return embed

-         return embed, stack(next_kv_caches)
+         next_kv_cache = stack(next_kv_caches)
+
+         next_kv_cache = next_kv_cache[..., -self.window_size:, :]
+
+         return embed, next_kv_cache

  # class
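Aside (not part of the package diff): the loop above implements value residual learning: the first layer's unmixed values (`orig_v`) are carried forward, and every later layer interpolates its own values toward them with a learned per-head sigmoid gate. A standalone sketch of just the mixing step, with shapes chosen arbitrarily:

```python
import torch

b, h, n, d = 2, 8, 16, 64

v_first = torch.randn(b, h, n, d)  # values returned by the first attention layer
v_layer = torch.randn(b, h, n, d)  # values computed by a later layer
mix     = torch.rand(b, h, n, 1)   # sigmoid gate in (0, 1), per head and position

# Tensor.lerp(end, weight) = self + weight * (end - self):
# mix = 0 keeps the layer's own values, mix = 1 uses the first layer's values
v_mixed = v_layer.lerp(v_first, mix)
```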
 
@@ -314,28 +379,42 @@ class Locoformer(Module):

          self.value_network = value_network

+         self.fixed_window_size = transformer.fixed_window_size
+         self.window_size = transformer.window_size
+
      @property
      def device(self):
          return next(self.parameters()).device

+     def actor_parameters(self):
+         return self.unembedder.parameters()
+
+     def critic_parameters(self):
+         return self.value_network.parameters()
+
      def get_stateful_forward(
          self,
-         segment_size,
          initial_states: Tensor | None = None,
          inference_mode = False,
          has_batch_dim = False,
+         has_time_dim = False,
          **kwargs
      ):
+         window_size = self.window_size
+
          cache = None

-         def stateful_forward(state: Tensor, override_kwargs: dict = dict()):
+         def stateful_forward(state: Tensor, **override_kwargs):
              nonlocal cache

-             # handle no batch, for easier time rolling out against envs
+             # handle no batch or time, for easier time rolling out against envs

              if not has_batch_dim:
                  state = rearrange(state, '... -> 1 ...')

+             if not has_time_dim:
+                 state = rearrange(state, '... d -> ... 1 d')
+
              # forwards

              out, cache = self.forward(state, cache = cache, **{**kwargs, **override_kwargs})
@@ -344,10 +423,13 @@ class Locoformer(Module):

              cache_len = cache.shape[-2]

-             if divisible_by(cache_len, segment_size * 2):
-                 cache = cache[..., -segment_size:, :]
+             if self.fixed_window_size or divisible_by(cache_len, window_size * 2):
+                 cache = cache[..., -window_size:, :]
+
+             # maybe remove batch or time

-             # maybe remove batch
+             if not has_time_dim:
+                 out = tree_map_tensor(out, lambda t: rearrange(t, '... 1 d -> ... d'))

              if not has_batch_dim:
                  out = tree_map_tensor(out, lambda t: rearrange(t, '1 ... -> ...'))
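Aside (not part of the package diff): a toy trace of the trimming condition above, assuming `window_size = 4`. With `fixed_window_size = False` the cache grows to twice the window (Transformer-XL style) before being cut back; with `fixed_window_size = True` it is trimmed on every step:

```python
def should_trim(cache_len, window_size, fixed_window_size):
    # mirrors the condition in stateful_forward above
    return fixed_window_size or (cache_len % (window_size * 2)) == 0

cache_len = 0
for step in range(10):
    cache_len += 1                    # one new timestep cached per call
    if should_trim(cache_len, 4, False):
        cache_len = 4                 # keep only the last window_size entries
    print(step, cache_len)            # grows 1..8, trims to 4, grows to 8, trims again
```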
@@ -364,7 +446,7 @@ class Locoformer(Module):

          initial_logits = []

-         for state_segments in initial_states.split(segment_size, dim = -1):
+         for state_segments in initial_states.split(self.window_size, dim = -1):

              logits = stateful_forward(state_segments, return_values = False)
              initial_logits.append(logits)
locoformer-{0.0.5 → 0.0.7}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: locoformer
- Version: 0.0.5
+ Version: 0.0.7
  Summary: LocoFormer
  Project-URL: Homepage, https://pypi.org/project/locoformer/
  Project-URL: Repository, https://github.com/lucidrains/locoformer
@@ -53,7 +53,7 @@ Description-Content-Type: text/markdown

  [LocoFormer - Generalist Locomotion via Long-Context Adaptation](https://generalist-locomotion.github.io/)

- The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment). When transferring to the real-world, they noticed the robot now gains the ability to adapt to insults. The XL memories span across multiple trials, which allowed the robot to learn in-context adaptation.
+ The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment) with extreme domain randomization. When transferring to the real-world, they noticed the robot now gains the ability to adapt to insults. The XL memories span across multiple trials, which allowed the robot to learn in-context adaptation.

  ## Sponsors

locoformer-0.0.7.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ locoformer/__init__.py,sha256=XctsMGEZSR4mVl75fhds_1BtS5qGFiiItTDV7CmCt_I,45
+ locoformer/locoformer.py,sha256=lJQs0CKr9iztF8tie1FRUVEItCt-IZbIILQqKcgK2sI,13142
+ locoformer-0.0.7.dist-info/METADATA,sha256=PZ_phKV3t4Bha0GnUB5HPmE9w8A5fvNevsuN532Ls3s,3193
+ locoformer-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ locoformer-0.0.7.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ locoformer-0.0.7.dist-info/RECORD,,
locoformer-0.0.5.dist-info/RECORD DELETED
@@ -1,6 +0,0 @@
- locoformer/__init__.py,sha256=XctsMGEZSR4mVl75fhds_1BtS5qGFiiItTDV7CmCt_I,45
- locoformer/locoformer.py,sha256=Yoh3hrj2E_91YLoYRa73wGzjdIiMdcd5ofNjkiVlogI,10570
- locoformer-0.0.5.dist-info/METADATA,sha256=oe6HfOwWKQvusiJl1ukmNFcrGRhdDZ6NcKZi3upv-SY,3159
- locoformer-0.0.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- locoformer-0.0.5.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- locoformer-0.0.5.dist-info/RECORD,,