locoformer 0.0.17__tar.gz → 0.0.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {locoformer-0.0.17 → locoformer-0.0.37}/PKG-INFO +4 -2
- {locoformer-0.0.17 → locoformer-0.0.37}/README.md +1 -1
- {locoformer-0.0.17 → locoformer-0.0.37}/locoformer/locoformer.py +458 -63
- {locoformer-0.0.17 → locoformer-0.0.37}/pyproject.toml +3 -1
- locoformer-0.0.37/tests/test_locoformer.py +182 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/train.py +1 -1
- {locoformer-0.0.17 → locoformer-0.0.37}/train_gym.py +25 -26
- locoformer-0.0.17/tests/test_locoformer.py +0 -86
- {locoformer-0.0.17 → locoformer-0.0.37}/.github/workflows/python-publish.yml +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/.github/workflows/test.yml +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/.gitignore +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/LICENSE +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/data/README.md +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/data/enwik8.gz +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/fig3.png +0 -0
- {locoformer-0.0.17 → locoformer-0.0.37}/locoformer/__init__.py +0 -0
{locoformer-0.0.17 → locoformer-0.0.37}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: locoformer
-Version: 0.0.17
+Version: 0.0.37
 Summary: LocoFormer
 Project-URL: Homepage, https://pypi.org/project/locoformer/
 Project-URL: Repository, https://github.com/lucidrains/locoformer
@@ -38,8 +38,10 @@ Requires-Dist: assoc-scan
 Requires-Dist: beartype
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
+Requires-Dist: hl-gauss-pytorch>=0.2.0
 Requires-Dist: rotary-embedding-torch
 Requires-Dist: torch>=2.4
+Requires-Dist: x-evolution
 Requires-Dist: x-mlps-pytorch
 Provides-Extra: examples
 Requires-Dist: accelerate; extra == 'examples'
@@ -54,7 +56,7 @@ Description-Content-Type: text/markdown
 
 [LocoFormer - Generalist Locomotion via Long-Context Adaptation](https://generalist-locomotion.github.io/)
 
-The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment)
+The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment) and extreme domain randomization. When transferring to the real-world, they noticed the robot now gains the ability to adapt to insults. The XL memories span across multiple trials, which allowed the robot to learn in-context adaptation.
 
 ## Sponsors
 
{locoformer-0.0.17 → locoformer-0.0.37}/README.md

@@ -4,7 +4,7 @@
 
 [LocoFormer - Generalist Locomotion via Long-Context Adaptation](https://generalist-locomotion.github.io/)
 
-The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment)
+The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment) and extreme domain randomization. When transferring to the real-world, they noticed the robot now gains the ability to adapt to insults. The XL memories span across multiple trials, which allowed the robot to learn in-context adaptation.
 
 ## Sponsors
 
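The cross-trial, in-context adaptation described in the README maps onto the new cache plumbing in `locoformer/locoformer.py` below: the forward pass now returns `(output, (next_timestep, kv_cache))`, and feeding that cache back in lets a later trial condition on the memories of an earlier one. A minimal sketch (not part of the package; the model configuration mirrors the new `tests/test_locoformer.py`):

```python
import torch
from torch import nn
from x_mlps_pytorch import MLP
from locoformer.locoformer import Locoformer

# toy model, configured the same way as in the new test file
model = Locoformer(
    embedder = nn.Embedding(256, 128),
    unembedder = nn.Linear(128, 256, bias = False),
    value_network = MLP(128, 64, 32),
    dim_value_input = 32,
    reward_range = (-100., 100.),
    transformer = dict(dim = 128, depth = 1, window_size = 512)
)

trial_one = torch.randint(0, 256, (1, 512))  # stand-in for the tokens of a first trial
trial_two = torch.randint(0, 256, (1, 512))  # ... and a second trial of the same task

# carry the XL cache across the trial boundary, so the policy in trial two
# can adapt based on what happened in trial one (in-context adaptation)
_, cache = model(trial_one)
(logits, values), cache = model(trial_two, cache = cache, return_values = True)
```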
{locoformer-0.0.17 → locoformer-0.0.37}/locoformer/locoformer.py

@@ -1,10 +1,14 @@
 from __future__ import annotations
+from typing import Callable
+from types import SimpleNamespace
 from functools import partial
 
 from pathlib import Path
 from contextlib import contextmanager
 from collections import namedtuple
 
+from inspect import signature
+
 import numpy as np
 from numpy import ndarray
 from numpy.lib.format import open_memmap
@@ -16,7 +20,7 @@ import torch
 from torch import nn, cat, stack, arange, Tensor, tensor, is_tensor, from_numpy
 import torch.nn.functional as F
 from torch.nn import Module, ModuleList, Linear, RMSNorm, Identity, Sequential
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
 from torch.utils.data import Dataset, DataLoader
 from torch.optim import Optimizer
 
@@ -26,12 +30,20 @@ from einops.layers.torch import Rearrange
 
 from rotary_embedding_torch import RotaryEmbedding
 
+from hl_gauss_pytorch import HLGaussLoss
+
 from assoc_scan import AssocScan
 
+from x_mlps_pytorch import MLP
+
+from x_evolution import EvoStrategy
+
 # constants
 
 LinearNoBias = partial(Linear, bias = False)
 
+Cache = namedtuple('Cache', ('curr_timestep', 'kv_cache')) # (int, Tensor)
+
 # helper functions
 
 def exists(v):
@@ -43,14 +55,24 @@ def default(v, d):
 def first(arr):
     return arr[0]
 
+def xnor(x, y):
+    return not (x ^ y)
+
 def divisible_by(num, den):
     return (num % den) == 0
 
+def get_param_names(fn):
+    parameters = signature(fn).parameters
+    return list(parameters.keys())
+
 # tensor helpers
 
 def log(t, eps = 1e-20):
     return t.clamp_min(eps).log()
 
+def is_empty(t):
+    return t.numel() == 0
+
 def tree_map_tensor(x, fn):
     return tree_map(lambda t: t if not is_tensor(t) else fn(t), x)
 
@@ -67,10 +89,102 @@ def pad_at_dim(
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)
 
+def normalize(t, eps = 1e-5):
+    return (t - t.mean()) / t.std().clamp_min(eps)
+
+def tensor_to_dict(
+    t: Tensor,
+    config: tuple[tuple[str, int] | str],
+    dim = -1,
+    return_dottable = True
+):
+    config = tuple((c, 1) if isinstance(c, str) else c for c in config)
+
+    names, sizes = zip(*config)
+    assert sum(sizes) == t.shape[dim]
+
+    t = t.split(sizes, dim = dim)
+    tensor_dict = dict(zip(names, t))
+
+    if not return_dottable:
+        return tensor_dict
+
+    return SimpleNamespace(**tensor_dict)
+
 def calc_entropy(logits):
     prob = logits.softmax(dim = -1)
     return -(prob * log(prob)).sum(dim = -1)
 
+# reward functions - A.2
+
+def reward_linear_velocity_command_tracking(
+    state,
+    command,
+    s1 = 1.
+):
+    if not (hasattr(state, 'v_xy') and hasattr(command, 'v_xy')):
+        return 0.
+
+    error = (state.v_xy - command.v_xy).norm(dim = -1).pow(2)
+    return torch.exp(-error / s1)
+
+def reward_angular_velocity_command_tracking(
+    state,
+    command,
+    s2 = 1.
+):
+    if not (hasattr(state, 'w_z') and hasattr(command, 'w_z')):
+        return 0.
+
+    error = (state.w_z - command.w_z).norm(dim = -1).pow(2)
+    return torch.exp(-error / s2)
+
+def reward_base_linear_velocity_penalty(
+    state
+):
+    if not hasattr(state, 'v_z'):
+        return 0.
+
+    return -state.v_z.norm(dim = -1).pow(2)
+
+def reward_base_angular_velocity_penalty(
+    state
+):
+    if not hasattr(state, 'w_xy'):
+        return 0.
+
+    return -state.w_xy.norm(dim = -1).pow(2)
+
+def reward_base_height_penalty(
+    state,
+    x_z_nominal = 0.27
+):
+    if not hasattr(state, 'x_z'):
+        return 0.
+
+    return -(state.x_z - x_z_nominal).norm(dim = -1).pow(2)
+
+def reward_joint_acceleration_penalty(
+    state
+):
+    if not hasattr(state, 'joint_q'):
+        return 0.
+
+    return -state.joint_q.norm(dim = -1).pow(2)
+
+def reward_torque_penalty(
+    state
+):
+    if not hasattr(state, 'tau'):
+        return 0.
+
+    return -state.tau.norm(dim = -1).pow(2)
+
+def reward_alive(
+    state
+):
+    return 1.
+
 # generalized advantage estimate
 
 @torch.no_grad()
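The reward functions above expect dottable states and commands (fields like `v_xy`, `w_z`), which is exactly what `tensor_to_dict` produces from a flat observation vector. A small sketch of how the two pieces fit together; the observation layout here is made up purely for illustration:

```python
import torch
from locoformer.locoformer import tensor_to_dict, reward_linear_velocity_command_tracking

# hypothetical flat observation: 2 dims of base xy velocity, 1 dim of yaw rate, 3 leftover dims
obs = torch.randn(6)
state = tensor_to_dict(obs, (('v_xy', 2), 'w_z', ('rest', 3)))

# command with the same named layout
command = tensor_to_dict(torch.tensor([0.5, 0.0, 0.1]), (('v_xy', 2), 'w_z'))

# exp(-||v_xy error||^2 / s1), per reward_linear_velocity_command_tracking above
tracking_reward = reward_linear_velocity_command_tracking(state = state, command = command)
```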
@@ -250,6 +364,57 @@ class ReplayDataset(Dataset):
 
         return data
 
+class RemappedReplayDataset(Dataset):
+    def __init__(
+        self,
+        dataset: ReplayDataset,
+        episode_mapping: Tensor | list[list[int]],
+        shuffle_episodes = False
+    ):
+        assert len(dataset) > 0
+        self.dataset = dataset
+
+        if is_tensor(episode_mapping):
+            assert episode_mapping.dtype in (torch.int, torch.long) and episode_mapping.ndim == 2
+            episode_mapping = episode_mapping.tolist()
+
+        self.episode_mapping = episode_mapping
+        self.shuffle_episodes = shuffle_episodes
+
+    def __len__(self):
+        return len(self.episode_mapping)
+
+    def __getitem__(self, idx):
+
+        episode_indices = self.episode_mapping[idx]
+
+        episode_indices = tensor(episode_indices)
+        episode_indices = episode_indices[(episode_indices >= 0) & (episode_indices < len(self.dataset))]
+
+        assert not is_empty(episode_indices)
+
+        if self.shuffle_episodes and episode_indices.numel() > 1:
+            num_episodes = len(episode_indices)
+            episode_indices = episode_indices[torch.randperm(num_episodes)]
+
+        episode_data = [self.dataset[i] for i in episode_indices.tolist()]
+
+        episode_lens = stack([data.pop('_lens') for data in episode_data])
+
+        keys = first(episode_data).keys()
+
+        values = [list(data.values()) for data in episode_data]
+
+        values = [cat(field_values) for field_values in zip(*values)] # concat across time
+
+        multi_episode_data = dict(zip(keys, values))
+
+        multi_episode_data['_lens'] = episode_lens.sum()
+
+        multi_episode_data['_episode_indices'] = cat([torch.full((episode_len,), episode_index) for episode_len, episode_index in zip(episode_lens, episode_indices)])
+
+        return multi_episode_data
+
 class ReplayBuffer:
 
     @beartype
@@ -314,6 +479,9 @@ class ReplayBuffer:
 
         self.memory_namedtuple = namedtuple('Memory', list(fields.keys()))
 
+    def __len__(self):
+        return (self.episode_lens > 0).sum().item()
+
     def reset_(self):
         self.episode_lens[:] = 0
         self.episode_index = 0
@@ -375,15 +543,92 @@ class ReplayBuffer:
 
         return self.memory_namedtuple(**data)
 
-    def dataset(
+    def dataset(
+        self,
+        episode_mapping: Tensor | list[list[int]] | None = None,
+    ) -> Dataset:
         self.flush()
 
-
+        dataset = ReplayDataset(self.folder)
+
+        if not exists(episode_mapping):
+            return dataset
+
+        return RemappedReplayDataset(dataset, episode_mapping)
 
-    def dataloader(
+    def dataloader(
+        self,
+        batch_size,
+        episode_mapping: Tensor | list[list[int]] | None = None,
+        **kwargs
+    ) -> DataLoader:
         self.flush()
 
-        return DataLoader(self.dataset(), batch_size = batch_size, collate_fn = collate_var_time, **kwargs)
+        return DataLoader(self.dataset(episode_mapping), batch_size = batch_size, collate_fn = collate_var_time, **kwargs)
+
+# normalization + conditioning (needed for the commands to the robot)
+
+class MaybeAdaRMSNormWrapper(Module):
+    def __init__(
+        self,
+        fn: Module,
+        dim,
+        dim_cond = None
+    ):
+        super().__init__()
+        condition = exists(dim_cond)
+
+        self.fn = fn
+        self.norm = nn.RMSNorm(dim, elementwise_affine = not condition)
+
+        self.accept_condition = condition
+
+        if condition:
+            self.to_gamma = LinearNoBias(dim_cond, dim)
+            self.to_ada_norm_zero = nn.Linear(dim_cond, dim)
+
+            nn.init.zeros_(self.to_gamma.weight)
+            nn.init.zeros_(self.to_ada_norm_zero.weight)
+            nn.init.constant_(self.to_ada_norm_zero.bias, -5.)
+
+    def forward(
+        self,
+        x,
+        cond = None,
+        **kwargs
+    ):
+
+        need_cond = self.accept_condition
+
+        assert xnor(exists(cond), need_cond)
+
+        prenormed = self.norm(x)
+
+        if need_cond:
+            if cond.ndim == 2:
+                cond = rearrange(cond, 'b d -> b 1 d')
+
+            scale_in = self.to_gamma(cond)
+            prenormed = prenormed * (scale_in + 1.)
+
+        all_fn_out = self.fn(prenormed, **kwargs)
+
+        if not need_cond:
+            return all_fn_out
+
+        # function may return multiple args
+
+        (out, *rest), tree_spec = tree_flatten(all_fn_out)
+
+        if need_cond:
+            scale_out = self.to_ada_norm_zero(cond).sigmoid()
+            out = out * scale_out
+
+        # restore
+
+        all_fn_out = tree_unflatten((out, *rest), tree_spec)
+
+        return all_fn_out
 
 # transformer-xl with ppo
 
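`MaybeAdaRMSNormWrapper` above is what lets velocity commands condition every block: the normalized input is scaled by `1 + gamma(cond)` with `gamma` zero-initialized, and the block output is gated by a sigmoid whose bias starts at -5, so a freshly initialized conditioned block contributes almost nothing. A usage sketch, assuming the class is importable exactly as it appears in this diff:

```python
import torch
from torch import nn
from locoformer.locoformer import MaybeAdaRMSNormWrapper

# wrap any token-wise module; passing dim_cond turns on the adaptive path
block = MaybeAdaRMSNormWrapper(nn.Linear(64, 64), dim = 64, dim_cond = 16)

tokens = torch.randn(2, 10, 64)
cond = torch.randn(2, 16)          # e.g. an embedded locomotion command

out = block(tokens, cond = cond)   # same shape as tokens

# at init, to_gamma is zero and the output gate is sigmoid(-5) ≈ 0.0067,
# so the wrapped block is nearly silent until the conditioning is learned
```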
@@ -394,15 +639,12 @@ class Attention(Module):
         window_size,
         dim_head = 64,
         heads = 8,
-        pre_rmsnorm = True,
         fixed_window_size = False,
         accept_value_residual = False
     ):
         super().__init__()
         self.scale = dim_head ** -0.5
 
-        self.norm = RMSNorm(dim) if pre_rmsnorm else Identity()
-
         self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)
         self.merge_heads = Rearrange('b h n d -> b n (h d)')
 
@@ -446,8 +688,6 @@
 
         device = tokens.device
 
-        tokens = self.norm(tokens)
-
         q, k, v = (self.to_q(tokens), *self.to_kv(tokens).chunk(2, dim = -1))
 
         q, k, v = map(self.split_heads, (q, k, v))
@@ -536,19 +776,26 @@ class TransformerXL(Module):
         dim_head = 64,
         heads = 8,
         expansion_factor = 4.,
+        dim_cond = None,
         final_norm = True,
         fixed_window_size = False,
     ):
         super().__init__()
 
+        condition = exists(dim_cond)
+
+        self.to_cond_tokens = MLP(dim_cond, dim * 2, activate_last = True) if exists(dim_cond) else None
+
+        norm_fn = partial(MaybeAdaRMSNormWrapper, dim = dim, dim_cond = (dim * 2) if condition else None)
+
         layers = ModuleList([])
 
         for i in range(depth):
             is_first = i == 0
 
-            attn = Attention(dim = dim, dim_head = dim_head, heads = heads, fixed_window_size = fixed_window_size, window_size = window_size, accept_value_residual = not is_first)
+            attn = norm_fn(Attention(dim = dim, dim_head = dim_head, heads = heads, fixed_window_size = fixed_window_size, window_size = window_size, accept_value_residual = not is_first))
 
-            ff = FeedForward(dim = dim, expansion_factor = expansion_factor)
+            ff = norm_fn(FeedForward(dim = dim, expansion_factor = expansion_factor))
 
             layers.append(ModuleList([
                 attn, ff
@@ -566,20 +813,32 @@ class TransformerXL(Module):
         self,
         x,
         cache = None,
-        return_kv_cache = False
+        return_kv_cache = False,
+        condition: Tensor | None = None
     ):
 
+        # cache and residuals
+
         cache = default(cache, (None,) * len(self.layers))
 
        next_kv_caches = []
        value_residual = None
 
+        # handle condition
+
+        cond_tokens = None
+        if exists(condition):
+            assert exists(self.to_cond_tokens)
+            cond_tokens = self.to_cond_tokens(condition)
+
+        # layers
+
         for (attn, ff), kv_cache in zip(self.layers, cache):
 
-            attn_out, (next_kv_cache, values) = attn(x, value_residual = value_residual, kv_cache = kv_cache, return_kv_cache = True)
+            attn_out, (next_kv_cache, values) = attn(x, cond = cond_tokens, value_residual = value_residual, kv_cache = kv_cache, return_kv_cache = True)
 
             x = attn_out + x
-            x = ff(x) + x
+            x = ff(x, cond = cond_tokens) + x
 
             next_kv_caches.append(next_kv_cache)
             value_residual = default(value_residual, values)
@@ -603,14 +862,21 @@ class Locoformer(Module):
         embedder: Module,
         unembedder: Module,
         transformer: dict | TransformerXL,
-        value_network: Module | None = None,
         discount_factor = 0.999,
         gae_lam = 0.95,
         ppo_eps_clip = 0.2,
         ppo_entropy_weight = 0.01,
         ppo_value_clip = 0.4,
+        dim_value_input = None, # needs to be set for value network to be available
+        value_network: Module = nn.Identity(),
+        reward_range: tuple[float, float] | None = None,
+        reward_shaping_fns: list[Callable[..., float | Tensor]] | None = None,
+        num_reward_bins = 32,
+        hl_gauss_loss_kwargs = dict(),
         value_loss_weight = 0.5,
-        calc_gae_kwargs: dict = dict()
+        calc_gae_kwargs: dict = dict(),
+        recurrent_kv_cache = True,
+        use_spo = False # simple policy optimization https://arxiv.org/abs/2401.16025 - Levine's group (PI) verified it is more stable than PPO
     ):
         super().__init__()
 
@@ -622,11 +888,30 @@ class Locoformer(Module):
         self.embedder = embedder
         self.unembedder = unembedder
 
-        self.value_network = value_network
-
         self.fixed_window_size = transformer.fixed_window_size
         self.window_size = transformer.window_size
 
+        # determine value network, using HL Gauss Layer
+
+        self.to_value_pred = None
+
+        if exists(dim_value_input):
+            assert exists(reward_range)
+
+            self.to_value_pred = nn.Sequential(
+                value_network,
+                LinearNoBias(dim_value_input, num_reward_bins)
+            )
+
+            reward_min, reward_max = reward_range
+
+            self.hl_gauss_loss = HLGaussLoss(
+                min_value = reward_min,
+                max_value = reward_max,
+                num_bins = num_reward_bins,
+                **hl_gauss_loss_kwargs
+            )
+
         # ppo related
 
         self.discount_factor = discount_factor
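The critic is no longer a scalar regressor in this release: `to_value_pred` emits logits over `num_reward_bins`, and `HLGaussLoss` (the histogram / HL-Gauss approach pulled in via the new `hl-gauss-pytorch` dependency) is used both to train against returns and to decode logits back into scalar values, mirroring the two call forms used later in `ppo()`. A small sketch of those call patterns, with made-up shapes:

```python
import torch
from hl_gauss_pytorch import HLGaussLoss

hl_gauss = HLGaussLoss(min_value = -100., max_value = 100., num_bins = 32)

value_logits = torch.randn(2, 10, 32)   # what to_value_pred would output
returns = torch.randn(2, 10) * 10.

# classification-style value loss against Gaussian-smeared target bins
value_loss = hl_gauss(value_logits, returns, reduction = 'none')

# calling with logits only decodes them back to scalar expected values
values = hl_gauss(value_logits)
```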
@@ -638,6 +923,19 @@ class Locoformer(Module):
 
         self.calc_gae_kwargs = calc_gae_kwargs
 
+        # maybe use spo
+
+        self.use_spo = use_spo
+
+        # maybe recurrent kv cache, from Ding et al. https://arxiv.org/abs/2012.15688
+
+        self.recurrent_kv_cache = recurrent_kv_cache
+
+        # reward shaping function
+
+        self.has_reward_shaping = exists(reward_shaping_fns)
+        self.reward_shaping_fns = reward_shaping_fns
+
         # loss related
 
         self.register_buffer('zero', tensor(0.), persistent = False)
@@ -650,10 +948,18 @@ class Locoformer(Module):
         return self.unembedder.parameters()
 
     def critic_parameters(self):
-        if not exists(self.
+        if not exists(self.to_value_pred):
             return []
 
-        return self.
+        return self.to_value_pred.parameters()
+
+    def evolve(
+        self,
+        environment,
+        **kwargs
+    ):
+        evo_strat = EvoStrategy(self, environment = environment, **kwargs)
+        evo_strat()
 
     def ppo(
         self,
@@ -663,12 +969,20 @@
         reward,
         old_value,
         mask,
+        episode_lens,
         actor_optim: Optimizer | None = None,
         critic_optim: Optimizer | None = None
     ):
         window_size = self.window_size
         total_learnable_tokens = mask.sum().item()
 
+        seq_len = state.shape[1]
+        gae_mask = einx.less('j, i -> i j', arange(seq_len, device = self.device), episode_lens)
+
+        advantage, returns = calc_gae(reward, old_value, masks = gae_mask, lam = self.gae_lam, gamma = self.discount_factor, **self.calc_gae_kwargs)
+
+        advantage = normalize(advantage)
+
         windowed_tensors = [
             t.split(window_size, dim = 1) for t in
             (
@@ -677,7 +991,9 @@
                 old_action_log_prob,
                 reward,
                 old_value,
-                mask
+                mask,
+                advantage,
+                returns
             )
         ]
 
@@ -694,10 +1010,12 @@
             old_action_log_prob,
             reward,
             old_value,
-            mask
+            mask,
+            advantage,
+            returns
         ) in zip(*windowed_tensors):
 
-            (action_logits,
+            (action_logits, value_logits), cache = self.forward(state, cache = cache, detach_cache = True, return_values = True, return_raw_value_logits = True)
             entropy = calc_entropy(action_logits)
 
             action = rearrange(action, 'b t -> b t 1')
@@ -709,9 +1027,10 @@
             eps_clip = self.ppo_eps_clip
             ratio = (log_prob - old_action_log_prob).exp()
 
-
-
-
+            if self.use_spo:
+                actor_loss = -(ratio * advantage - (advantage.abs() * (ratio - 1.).square()) / (2 * eps_clip))
+            else:
+                actor_loss = -torch.min(ratio * advantage, ratio.clamp(1. - eps_clip, 1. + eps_clip) * advantage)
 
             actor_loss = actor_loss - self.ppo_entropy_weight * entropy
 
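For reference, the two surrogate objectives selected by `use_spo` above, written out with r_t the probability ratio and Â_t the normalized advantage (a direct transcription of the code: PPO-clip versus the quadratic-penalty form of simple policy optimization):

```latex
\mathcal{L}_{\mathrm{PPO}} = -\,\mathbb{E}_t\Big[\min\big(r_t\,\hat{A}_t,\ \mathrm{clip}(r_t,\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big]
\qquad
\mathcal{L}_{\mathrm{SPO}} = -\,\mathbb{E}_t\Big[r_t\,\hat{A}_t \;-\; \frac{|\hat{A}_t|\,(r_t-1)^2}{2\,\epsilon}\Big]
```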
@@ -720,11 +1039,13 @@
 
             # update critic
 
-            value_loss =
+            value_loss = self.hl_gauss_loss(value_logits, returns, reduction = 'none')
 
             value_clip = self.ppo_value_clip
+            value = self.hl_gauss_loss(value_logits)
+
             clipped_value = old_value + (value - old_value).clamp(-value_clip, value_clip)
-            clipped_value_loss =
+            clipped_value_loss = self.hl_gauss_loss(clipped_value, returns, reduction = 'none')
 
             critic_loss = torch.maximum(value_loss, clipped_value_loss) * self.value_loss_weight
 
@@ -750,28 +1071,65 @@
 
         return mean_actor_loss.detach(), mean_critic_loss.detach()
 
+    def state_and_command_to_rewards(
+        self,
+        state,
+        commands = None
+    ) -> Tensor:
+
+        assert self.has_reward_shaping
+
+        rewards = []
+
+        for fn in self.reward_shaping_fns:
+            param_names = get_param_names(fn)
+            param_names = set(param_names) & {'state', 'command'}
+
+            if param_names == {'state'}: # only state
+                reward = fn(state = state)
+            elif param_names == {'state', 'command'}: # state and command
+                reward = fn(state = state, command = commands)
+            else:
+                raise ValueError('invalid number of arguments for reward shaping function')
+
+            rewards.append(reward)
+
+        # cast to Tensor if returns a float, just make it flexible for researcher
+
+        rewards = [tensor(reward) if not is_tensor(reward) else reward for reward in rewards]
+
+        return stack(rewards)
+
     def wrap_env_functions(self, env):
 
-        def
-
+        def transform_output(el):
+            if isinstance(el, ndarray):
+                return from_numpy(el)
+            elif isinstance(el, (int, bool, float)):
+                return tensor(el)
+            else:
+                return el
 
-
-
+        def wrapped_reset(*args, **kwargs):
+            env_reset_out = env.reset(*args, **kwargs)
 
-            return
+            return tree_map(transform_output, env_reset_out)
 
         def wrapped_step(action, *args, **kwargs):
-            out = env.step(action.item(), *args, **kwargs)
 
-
-
-
-
-
-
+            if is_tensor(action):
+                action = action.item()
+
+            env_step_out = env.step(action, *args, **kwargs)
+
+            env_step_out_torch = tree_map(transform_output, env_step_out)
+
+            if not self.has_reward_shaping:
+                return env_step_out_torch
 
-
+            shaped_rewards = self.state_and_command_to_rewards(env_step_out_torch)
+
+            return env_step_out_torch, shaped_rewards
 
         return wrapped_reset, wrapped_step
 
@@ -781,13 +1139,18 @@
         inference_mode = False,
         has_batch_dim = False,
         has_time_dim = False,
+        state_time_dim = 1,
         **kwargs
     ):
         window_size = self.window_size
 
         cache = None
 
-        def stateful_forward(
+        def stateful_forward(
+            state: Tensor,
+            condition: Tensor | None = None,
+            **override_kwargs
+        ):
             nonlocal cache
 
             # handle no batch or time, for easier time rolling out against envs
@@ -795,24 +1158,23 @@
             if not has_batch_dim:
                 state = rearrange(state, '... -> 1 ...')
 
-
-
-
-            # forwards
+                if exists(command):
+                    condition = rearrange(condition, '... -> 1 ...')
 
-
+            if not has_time_dim:
+                state = state.unsqueeze(state_time_dim)
 
-
+                if exists(command):
+                    condition = rearrange(condition, '... d -> ... 1 d')
 
-
+            # forwards
 
-
-                cache = cache[..., -window_size:, :]
+            out, cache = self.forward(state, condition = condition, cache = cache, **{**kwargs, **override_kwargs})
 
             # maybe remove batch or time
 
             if not has_time_dim:
-                out = tree_map_tensor(out, lambda t:
+                out = tree_map_tensor(out, lambda t: t.squeeze(state_time_dim))
 
             if not has_batch_dim:
                 out = tree_map_tensor(out, lambda t: rearrange(t, '1 ... -> ...'))
@@ -841,16 +1203,36 @@
     def forward(
         self,
         state: Tensor,
-        cache:
+        cache: Cache | None = None,
+        condition: Tensor | None = None,
         detach_cache = False,
-        return_values = False
+        return_values = False,
+        return_raw_value_logits = False
     ):
 
         state = state.to(self.device)
 
         tokens = self.embedder(state)
 
-
+        # time
+
+        time = tokens.shape[-2]
+
+        # destruct the cache for the current timestep and the cache
+
+        prev_kv_cache = None
+        timestep_start = 0
+
+        if exists(cache):
+            timestep_start, prev_kv_cache = cache
+
+        # an assert - make sure during training or inference, forward never gets anything that crosses the window segment boundary, to open up some possibilities with extending memory
+
+        assert ((timestep_start % self.window_size) + time) <= self.window_size
+
+        # attention
+
+        embed, kv_cache = self.transformer(tokens, condition = condition, cache = prev_kv_cache, return_kv_cache = True)
 
         # unembed to actions - in language models this would be the next state
 
@@ -866,16 +1248,29 @@
         # handle returning of values
 
         if return_values:
-            assert exists(self.
+            assert exists(self.to_value_pred)
 
-            values = self.
+            values = self.to_value_pred(embed)
 
-            if
-
-                values = rearrange(values, '... 1 -> ...')
+            if not return_raw_value_logits:
+                values = self.hl_gauss_loss(values) # converts the value logits to scalar values
 
             out = (out, values)
 
         # output and cache
 
-
+        next_timestep = time + timestep_start
+
+        # handle curtailing kv cache at the right intervals
+
+        window_size = self.window_size
+
+        if self.fixed_window_size or divisible_by(next_timestep, window_size * 2):
+            kv_cache = kv_cache[..., -window_size:, :]
+
+        # maybe recurrent cache - shift the kv cache from one layer above to the one below, for extending on receptive field of past
+
+        if self.recurrent_kv_cache and divisible_by(next_timestep, window_size):
+            kv_cache = torch.roll(kv_cache, shifts = -1, dims = 0)
+
+        return out, (next_timestep, kv_cache)
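One detail worth noting from the forward pass above: when `recurrent_kv_cache` is on, the stacked per-layer cache is rolled along its layer dimension at window boundaries, so the cache a layer reads on the next segment is the one a deeper layer just produced, extending how far back memories can reach (the Ding et al. reference in the comment). A toy illustration of what that `torch.roll` does, assuming the per-layer caches are stacked along dim 0 as the `dims = 0` in the code implies; the shapes here are invented:

```python
import torch

num_layers = 4

# pretend cache: (layers, batch, heads, window, dim_head), with the layer index baked into the values
kv_cache = torch.arange(num_layers).float().view(num_layers, 1, 1, 1, 1).expand(num_layers, 2, 8, 512, 64)

shifted = torch.roll(kv_cache, shifts = -1, dims = 0)

# layer 0 now holds what layer 1 produced on the previous segment (the last layer wraps around)
assert shifted[0].eq(kv_cache[1]).all()
```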
{locoformer-0.0.17 → locoformer-0.0.37}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "locoformer"
-version = "0.0.17"
+version = "0.0.37"
 description = "LocoFormer"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -30,8 +30,10 @@ dependencies = [
     "beartype",
     "einx>=0.3.0",
     "einops>=0.8.0",
+    "hl-gauss-pytorch>=0.2.0",
     "rotary-embedding-torch",
     "torch>=2.4",
+    "x-evolution",
     "x-mlps-pytorch",
 ]
 
locoformer-0.0.37/tests/test_locoformer.py (new file)

@@ -0,0 +1,182 @@
+import pytest
+param = pytest.mark.parametrize
+
+import torch
+from torch import nn
+from x_mlps_pytorch import MLP
+
+from einops import rearrange
+
+from locoformer.locoformer import Locoformer
+
+@param('recurrent_kv_cache', (False, True))
+@param('has_commands', (False, True))
+def test_locoformer(
+    recurrent_kv_cache,
+    has_commands
+):
+
+    model = Locoformer(
+        embedder = nn.Embedding(256, 128),
+        unembedder = nn.Linear(128, 256, bias = False),
+        value_network = MLP(128, 64, 32),
+        dim_value_input = 32,
+        reward_range = (-100., 100.),
+        recurrent_kv_cache = recurrent_kv_cache,
+        transformer = dict(
+            dim = 128,
+            depth = 1,
+            window_size = 512,
+            dim_cond = 2 if has_commands else None
+        )
+    )
+
+    seq = torch.randint(0, 256, (3, 512))
+
+    commands = None
+    if has_commands:
+        commands = torch.randn(3, 512, 2)
+
+    (logits, values), cache = model(seq, condition = commands, return_values = True)
+    (logits, values), cache = model(seq, condition = commands, return_values = True, cache = cache)
+    (logits, values), cache = model(seq, condition = commands, return_values = True, cache = cache)
+
+    assert logits.shape == (3, 512, 256)
+
+    stateful_forward = model.get_stateful_forward(has_batch_dim = True, has_time_dim = True, return_values = True, inference_mode = True)
+
+    inference_command = torch.randn(1, 1, 2) if has_commands else None
+
+    for state in seq.unbind(dim = -1):
+        state = rearrange(state, 'b -> b 1')
+
+        logits, values = stateful_forward(state, condition = inference_command)
+        assert logits.shape == (3, 1, 256)
+
+def test_replay():
+    from locoformer.locoformer import ReplayBuffer
+
+    replay_buffer = ReplayBuffer(
+        './replay_data',
+        max_episodes = 10_000,
+        max_timesteps = 501,
+        fields = dict(
+            state = ('float', (8,)),
+            action = 'int',
+            action_log_prob = 'float',
+            reward = 'float',
+            value = 'float',
+            done = 'bool'
+        )
+    )
+
+    lens = [3, 5, 4]
+
+    for episode_len in lens:
+        with replay_buffer.one_episode():
+            for _ in range(episode_len):
+                state = torch.randn((8,))
+                action = torch.randint(0, 4, ())
+                log_prob = torch.randn(())
+                reward = torch.randn(())
+                value = torch.randn(())
+                done = torch.randint(0, 2, ()).bool()
+
+                replay_buffer.store(
+                    state = state,
+                    action = action,
+                    action_log_prob = log_prob,
+                    reward = reward,
+                    value = value,
+                    done = done
+                )
+
+    dataset = replay_buffer.dataset()
+
+    assert len(dataset) == 3
+
+    assert torch.is_tensor(dataset[0]['state'])
+
+    dataloader = replay_buffer.dataloader(batch_size = 3)
+
+    assert next(iter(dataloader))['state'].shape[0] == 3
+
+    # we will now consider consecutive pairs of episodes as 2 trials to be used for in-context adaptation
+    # but realistically there will be a function that converts a given ReplayBuffer -> Int[batch, episode_indices]
+
+    from torch import stack, arange
+
+    episode_indices = arange(len(replay_buffer))
+    remapped_episodes = stack((episode_indices[:-1], episode_indices[1:]))
+
+    dataloader = replay_buffer.dataloader(
+        batch_size = 1,
+        episode_mapping = remapped_episodes
+    )
+
+    assert next(iter(dataloader))['_lens'][0] == (3 + 5) # first and second episodes are concatted together timewise
+
+def test_reward_shaping():
+
+    model = Locoformer(
+        embedder = nn.Embedding(256, 128),
+        unembedder = nn.Linear(128, 256, bias = False),
+        value_network = MLP(128, 64, 32),
+        dim_value_input = 32,
+        reward_range = (-100., 100.),
+        reward_shaping_fns = [
+            lambda state: (state[3] - 2.5).pow(2).mean(),
+            lambda state, command: state[4:6].norm(dim = -1)
+        ],
+        transformer = dict(
+            dim = 128,
+            depth = 1,
+            window_size = 512
+        )
+    )
+
+    import numpy as np
+
+    class MockEnv:
+        def reset(self):
+            return np.random.normal(size = (10,))
+
+        def step(self, *args, **kwargs):
+            return np.random.normal(size = (10,))
+
+
+    env = MockEnv()
+
+    reset_fn, step_fn = model.wrap_env_functions(env)
+
+    reset_fn()
+
+    _, rewards = step_fn(3)
+
+    assert len(rewards) == 2
+
+def test_tensor_to_dict():
+    state = torch.randn(1, 3, 5)
+    config = (('xyz', 3), 'vx', 'vy')
+
+    from locoformer.locoformer import tensor_to_dict
+
+    state_dict = tensor_to_dict(state, config)
+    assert hasattr(state_dict, 'xyz') and state_dict.xyz.shape == (1, 3, 3)
+
+def test_evo():
+
+    model = Locoformer(
+        embedder = nn.Embedding(256, 128),
+        unembedder = nn.Linear(128, 256, bias = False),
+        value_network = MLP(128, 64, 32),
+        dim_value_input = 32,
+        reward_range = (-100., 100.),
+        transformer = dict(
+            dim = 128,
+            depth = 1,
+            window_size = 512,
+        )
+    )
+
+    model.evolve(lambda model: 1., num_generations = 1)
{locoformer-0.0.17 → locoformer-0.0.37}/train_gym.py

@@ -25,7 +25,6 @@ import torch.nn.functional as F
 from torch.utils.data import TensorDataset, DataLoader
 from torch.optim import Adam
 
-import einx
 from einops import rearrange
 
 from locoformer.locoformer import Locoformer, ReplayBuffer
@@ -60,8 +59,6 @@ def learn(
     batch_size = 16,
     epochs = 2,
 ):
-    device = accelerator.device
-
     dl = replay.dataloader(batch_size = batch_size, shuffle = True)
     model, dl, actor_optim, critic_optim = accelerator.prepare(model, dl, actor_optim, critic_optim)
 
@@ -70,18 +67,14 @@ def learn(
 
         data = SimpleNamespace(**data)
 
-        seq_len = data.state.shape[1]
-
-        value_mask = einx.less('j, i -> i j', arange(seq_len, device = device), data._lens)
-        value = torch.where(value_mask, data.value, 0.)
-
         actor_loss, critic_loss = model.ppo(
             state = data.state,
             action = data.action,
             old_action_log_prob = data.action_log_prob,
             reward = data.reward,
-            old_value = value,
+            old_value = data.value,
             mask = data.learnable,
+            episode_lens = data._lens,
             actor_optim = actor_optim,
             critic_optim = critic_optim
         )
@@ -94,7 +87,7 @@ def main(
     env_name = 'LunarLander-v3',
     num_episodes = 50_000,
     max_timesteps = 500,
-    num_episodes_before_learn =
+    num_episodes_before_learn = 64,
     clear_video = True,
     video_folder = 'recordings',
     record_every_episode = 250,
@@ -105,7 +98,8 @@ def main(
     ppo_eps_clip = 0.2,
     ppo_entropy_weight = .01,
     batch_size = 16,
-    epochs =
+    epochs = 3,
+    reward_range = (-100., 100.)
 ):
 
     # accelerate
@@ -153,7 +147,6 @@ def main(
     locoformer = Locoformer(
         embedder = MLP(dim_state, 64, bias = False),
         unembedder = MLP(64, num_actions, bias = False),
-        value_network = MLP(64, 1, bias = False),
         transformer = dict(
             dim = 64,
             dim_head = 32,
@@ -165,16 +158,20 @@ def main(
         gae_lam = gae_lam,
         ppo_eps_clip = ppo_eps_clip,
         ppo_entropy_weight = ppo_entropy_weight,
+        use_spo = True,
+        value_network = MLP(64, 64),
+        dim_value_input = 64,
+        reward_range = reward_range,
+        hl_gauss_loss_kwargs = dict(),
+        recurrent_kv_cache = True,
         calc_gae_kwargs = dict(
             use_accelerated = False
-        )
+        ),
     ).to(device)
 
     optim_actor = Adam([*locoformer.transformer.parameters(), *locoformer.actor_parameters()], lr = learning_rate, betas = betas)
     optim_critic = Adam([*locoformer.transformer.parameters(), *locoformer.critic_parameters()], lr = learning_rate, betas = betas)
 
-    timesteps_learn = 0
-
     # able to wrap the env for all values to torch tensors and back
     # all environments should follow usual MDP interface, domain randomization should be given at instantiation
 
@@ -205,7 +202,8 @@ def main(
 
             # append to memory
 
-
+            exceeds_max_timesteps = timestep == (max_timesteps - 1)
+            done = truncated or terminated or tensor(exceeds_max_timesteps)
 
             # get log prob of action
 
@@ -222,23 +220,24 @@ def main(
                 learnable = tensor(True)
             )
 
-            #
-            # only if terminated signal not detected
+            # increment counters
 
-
-            _, next_value = stateful_forward(next_state, return_values = True)
+            timestep += 1
 
-
+            # break if done or exceed max timestep
 
-
+            if done:
 
-
+                # handle bootstrap value, which is a non-learnable timestep added with the next value for GAE
+                # only if terminated signal not detected
 
-
+                if not terminated:
+                    _, next_value = stateful_forward(next_state, return_values = True)
 
-
+                    memory._replace(value = next_value, learnable = False)
+
+                    replay.store(**memory._asdict())
 
-            if done or timestep >= max_timesteps:
                 break
 
             state = next_state
locoformer-0.0.17/tests/test_locoformer.py (deleted)

@@ -1,86 +0,0 @@
-import pytest
-param = pytest.mark.parametrize
-
-import torch
-from x_mlps_pytorch import MLP
-
-from einops import rearrange
-
-def test_locoformer():
-    from locoformer.locoformer import Locoformer
-    from torch import nn
-
-    model = Locoformer(
-        embedder = nn.Embedding(256, 128),
-        unembedder = nn.Linear(128, 256, bias = False),
-        value_network = MLP(128, 32, 1),
-        transformer = dict(
-            dim = 128,
-            depth = 1,
-            window_size = 512
-        )
-    )
-
-    seq = torch.randint(0, 256, (3, 512))
-
-    (logits, values), cache = model(seq, return_values = True)
-    (logits, values), cache = model(seq, return_values = True, cache = cache)
-    (logits, values), cache = model(seq, return_values = True, cache = cache)
-
-    assert logits.shape == (3, 512, 256)
-
-    stateful_forward = model.get_stateful_forward(has_batch_dim = True, has_time_dim = True, return_values = True, inference_mode = True)
-
-    for state in seq.unbind(dim = -1):
-        state = rearrange(state, 'b -> b 1')
-
-        logits, values = stateful_forward(state)
-        assert logits.shape == (3, 1, 256)
-
-def test_replay():
-    from locoformer.locoformer import ReplayBuffer
-
-    replay_buffer = ReplayBuffer(
-        './replay_data',
-        max_episodes = 10_000,
-        max_timesteps = 501,
-        fields = dict(
-            state = ('float', (8,)),
-            action = 'int',
-            action_log_prob = 'float',
-            reward = 'float',
-            value = 'float',
-            done = 'bool'
-        )
-    )
-
-    lens = [3, 5, 4]
-
-    for episode_len in lens:
-        with replay_buffer.one_episode():
-            for _ in range(episode_len):
-                state = torch.randn((8,))
-                action = torch.randint(0, 4, ())
-                log_prob = torch.randn(())
-                reward = torch.randn(())
-                value = torch.randn(())
-                done = torch.randint(0, 2, ()).bool()
-
-                replay_buffer.store(
-                    state = state,
-                    action = action,
-                    action_log_prob = log_prob,
-                    reward = reward,
-                    value = value,
-                    done = done
-                )
-
-    dataset = replay_buffer.dataset()
-
-    assert len(dataset) == 3
-
-    assert torch.is_tensor(dataset[0]['state'])
-
-    dataloader = replay_buffer.dataloader(batch_size = 3)
-
-    assert next(iter(dataloader))['state'].shape[0] == 3