locoformer 0.0.15__py3-none-any.whl → 0.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locoformer/locoformer.py +296 -75
- {locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/METADATA +3 -2
- locoformer-0.0.29.dist-info/RECORD +6 -0
- locoformer-0.0.15.dist-info/RECORD +0 -6
- {locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/WHEEL +0 -0
- {locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/licenses/LICENSE +0 -0
locoformer/locoformer.py
CHANGED
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import Callable
 from functools import partial
 
 from pathlib import Path
@@ -18,6 +19,7 @@ import torch.nn.functional as F
 from torch.nn import Module, ModuleList, Linear, RMSNorm, Identity, Sequential
 from torch.utils._pytree import tree_map
 from torch.utils.data import Dataset, DataLoader
+from torch.optim import Optimizer
 
 import einx
 from einops import rearrange, einsum
@@ -25,10 +27,16 @@ from einops.layers.torch import Rearrange
 
 from rotary_embedding_torch import RotaryEmbedding
 
+from hl_gauss_pytorch import HLGaussLoss
+
 from assoc_scan import AssocScan
 
+# constants
+
 LinearNoBias = partial(Linear, bias = False)
 
+Cache = namedtuple('Cache', ('curr_timestep', 'kv_cache')) # (int, Tensor)
+
 # helper functions
 
 def exists(v):
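The new `Cache` namedtuple carries the absolute timestep alongside the kv cache, and the reworked `forward` (last hunk below) uses that counter to curtail and recurse the cache at window boundaries. A minimal sketch of threading it through a rollout, assuming a constructed `model` and window-sized `state` chunks (both hypothetical):

    cache = None

    for state in states:
        out, cache = model(state, cache = cache)

        # Cache is just (curr_timestep, kv_cache)
        curr_timestep, kv_cache = cache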
@@ -48,12 +56,12 @@ def divisible_by(num, den):
 def log(t, eps = 1e-20):
     return t.clamp_min(eps).log()
 
+def is_empty(t):
+    return t.numel() == 0
+
 def tree_map_tensor(x, fn):
     return tree_map(lambda t: t if not is_tensor(t) else fn(t), x)
 
-def detach_all(x):
-    return tree_map_tensor(x, lambda t: t.detach())
-
 def pad_at_dim(
     t,
     pad: tuple[int, int],
@@ -67,6 +75,9 @@ def pad_at_dim(
     zeros = ((0, 0) * dims_from_right)
     return F.pad(t, (*zeros, *pad), value = value)
 
+def normalize(t, eps = 1e-5):
+    return (t - t.mean()) / t.std().clamp_min(eps)
+
 def calc_entropy(logits):
     prob = logits.softmax(dim = -1)
     return -(prob * log(prob)).sum(dim = -1)
@@ -100,7 +111,7 @@ def calc_gae(
 
     returns = gae + values
 
-    return returns
+    return gae, returns
 
 # transformer-xl mask w/ flex attn
 
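`calc_gae` now returns the advantages alongside the returns, so the reworked `ppo` below can normalize and consume them directly rather than re-deriving `returns - values`. For reference, a self-contained sketch of the masked GAE recurrence being computed, written as a plain backward loop (the package itself presumably parallelizes this with `AssocScan`):

    import torch

    def calc_gae_reference(rewards, values, masks, gamma = 0.999, lam = 0.95):
        # masks: (batch, time) floats, 1. while the episode is still running

        # bootstrap with a value of 0 past the final step
        values_next = torch.cat((values[:, 1:], torch.zeros_like(values[:, :1])), dim = 1)

        delta = rewards + gamma * values_next * masks - values

        # run gae_t = delta_t + gamma * lam * mask_t * gae_{t+1} backwards in time
        gae = torch.zeros_like(delta[:, -1])
        gaes = []

        for t in reversed(range(delta.shape[1])):
            gae = delta[:, t] + gamma * lam * masks[:, t] * gae
            gaes.append(gae)

        gae = torch.stack(gaes[::-1], dim = 1)

        return gae, gae + values # same (gae, returns) pair as the new signature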
@@ -250,6 +261,57 @@ class ReplayDataset(Dataset):
 
         return data
 
+class RemappedReplayDataset(Dataset):
+    def __init__(
+        self,
+        dataset: ReplayDataset,
+        episode_mapping: Tensor | list[list[int]],
+        shuffle_episodes = False
+    ):
+        assert len(dataset) > 0
+        self.dataset = dataset
+
+        if is_tensor(episode_mapping):
+            assert episode_mapping.dtype in (torch.int, torch.long) and episode_mapping.ndim == 2
+            episode_mapping = episode_mapping.tolist()
+
+        self.episode_mapping = episode_mapping
+        self.shuffle_episodes = shuffle_episodes
+
+    def __len__(self):
+        return len(self.episode_mapping)
+
+    def __getitem__(self, idx):
+
+        episode_indices = self.episode_mapping[idx]
+
+        episode_indices = tensor(episode_indices)
+        episode_indices = episode_indices[(episode_indices >= 0) & (episode_indices < len(self.dataset))]
+
+        assert not is_empty(episode_indices)
+
+        if self.shuffle_episodes and episode_indices.numel() > 1:
+            num_episodes = len(episode_indices)
+            episode_indices = episode_indices[torch.randperm(num_episodes)]
+
+        episode_data = [self.dataset[i] for i in episode_indices.tolist()]
+
+        episode_lens = stack([data.pop('_lens') for data in episode_data])
+
+        keys = first(episode_data).keys()
+
+        values = [list(data.values()) for data in episode_data]
+
+        values = [cat(field_values) for field_values in zip(*values)] # concat across time
+
+        multi_episode_data = dict(zip(keys, values))
+
+        multi_episode_data['_lens'] = episode_lens.sum()
+
+        multi_episode_data['_episode_indices'] = cat([torch.full((episode_len,), episode_index) for episode_len, episode_index in zip(episode_lens, episode_indices)])
+
+        return multi_episode_data
+
 class ReplayBuffer:
 
     @beartype
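`RemappedReplayDataset` is what lets the XL memories span multiple trials: it stitches several stored episodes into one long sequence. A small usage sketch, with hypothetical episode indices and an assumed populated `ReplayDataset`:

    # stitch episodes 0 and 1 into one multi-episode sequence, keep episode 2 alone
    episode_mapping = [[0, 1], [2]]

    remapped = RemappedReplayDataset(replay_dataset, episode_mapping, shuffle_episodes = True)

    multi_episode = remapped[0]
    # tensor fields are concatenated across time, '_lens' holds the summed length,
    # and '_episode_indices' tags each timestep with its source episode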
@@ -314,6 +376,9 @@ class ReplayBuffer:
 
         self.memory_namedtuple = namedtuple('Memory', list(fields.keys()))
 
+    def __len__(self):
+        return (self.episode_lens > 0).sum().item()
+
     def reset_(self):
         self.episode_lens[:] = 0
         self.episode_index = 0
@@ -375,15 +440,28 @@ class ReplayBuffer:
 
         return self.memory_namedtuple(**data)
 
-    def dataset(self) -> Dataset:
+    def dataset(
+        self,
+        episode_mapping: Tensor | list[list[int]] | None = None,
+    ) -> Dataset:
         self.flush()
 
-        return ReplayDataset(self.folder)
+        dataset = ReplayDataset(self.folder)
+
+        if not exists(episode_mapping):
+            return dataset
 
-    def dataloader(self, batch_size, **kwargs) -> DataLoader:
+        return RemappedReplayDataset(dataset, episode_mapping)
+
+    def dataloader(
+        self,
+        batch_size,
+        episode_mapping: Tensor | list[list[int]] | None = None,
+        **kwargs
+    ) -> DataLoader:
         self.flush()
 
-        return DataLoader(self.dataset(), batch_size = batch_size, collate_fn = collate_var_time, **kwargs)
+        return DataLoader(self.dataset(episode_mapping), batch_size = batch_size, collate_fn = collate_var_time, **kwargs)
 
 # transformer-xl with ppo
 
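The same `episode_mapping` threads through `ReplayBuffer.dataloader`, so multi-episode sequences can be batched directly (hypothetical buffer and mapping):

    loader = buffer.dataloader(batch_size = 4, episode_mapping = [[0, 1], [2, 3]], shuffle = True)

    for batch in loader:
        ...  # variable-length multi-episode sequences, collated by collate_var_time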
@@ -603,13 +681,21 @@ class Locoformer(Module):
         embedder: Module,
         unembedder: Module,
         transformer: dict | TransformerXL,
-        value_network: Module | None = None,
         discount_factor = 0.999,
         gae_lam = 0.95,
         ppo_eps_clip = 0.2,
         ppo_entropy_weight = 0.01,
         ppo_value_clip = 0.4,
-        value_loss_weight = 0.5,
+        dim_value_input = None, # needs to be set for value network to be available
+        value_network: Module = nn.Identity(),
+        reward_range: tuple[float, float] | None = None,
+        reward_shaping_fns: list[Callable[[Tensor], float | Tensor]] | None = None,
+        num_reward_bins = 32,
+        hl_gauss_loss_kwargs = dict(),
+        value_loss_weight = 0.5,
+        calc_gae_kwargs: dict = dict(),
+        recurrent_kv_cache = True,
+        use_spo = False # simple policy optimization https://arxiv.org/abs/2401.16025 - Levine's group (PI) verified it is more stable than PPO
     ):
         super().__init__()
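Note the API change: instead of handing over a complete `value_network`, callers now set `dim_value_input` to switch on the distributional value head, and `reward_range` becomes mandatory with it. A schematic construction exercising the new keywords; the embedder, unembedder, and transformer arguments are assumptions for illustration, not values from this diff:

    import torch.nn as nn

    model = Locoformer(
        embedder = nn.Linear(17, 512),    # hypothetical observation embedder
        unembedder = nn.Linear(512, 12),  # hypothetical action unembedder
        transformer = dict(dim = 512, depth = 4, window_size = 64), # assumed keys, check TransformerXL's signature
        dim_value_input = 512,            # enables the HL-Gauss value head
        reward_range = (-10., 10.),       # required once dim_value_input is set
        num_reward_bins = 32,
        use_spo = True
    )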
@@ -621,11 +707,30 @@ class Locoformer(Module):
         self.embedder = embedder
         self.unembedder = unembedder
 
-        self.value_network = value_network
-
         self.fixed_window_size = transformer.fixed_window_size
         self.window_size = transformer.window_size
 
+        # determine value network, using HL Gauss Layer
+
+        self.to_value_pred = None
+
+        if exists(dim_value_input):
+            assert exists(reward_range)
+
+            self.to_value_pred = nn.Sequential(
+                value_network,
+                LinearNoBias(dim_value_input, num_reward_bins)
+            )
+
+            reward_min, reward_max = reward_range
+
+            self.hl_gauss_loss = HLGaussLoss(
+                min_value = reward_min,
+                max_value = reward_max,
+                num_bins = num_reward_bins,
+                **hl_gauss_loss_kwargs
+            )
+
         # ppo related
 
         self.discount_factor = discount_factor
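For reading the hunks below, it helps to know `HLGaussLoss` is dual-purpose: called with logits and a target it is a cross-entropy loss over Gaussian-smeared bins, and called with logits alone it collapses the bins back to a scalar expectation. A small sketch of both modes exactly as this diff uses them, with assumed shapes:

    import torch
    from hl_gauss_pytorch import HLGaussLoss

    hl_gauss = HLGaussLoss(min_value = -10., max_value = 10., num_bins = 32)

    value_logits = torch.randn(2, 16, 32)            # (batch, time, num_reward_bins)
    returns = torch.empty(2, 16).uniform_(-10., 10.)

    loss = hl_gauss(value_logits, returns)           # loss against smeared two-hot targets
    values = hl_gauss(value_logits)                  # no target: expected scalar values, shape (2, 16)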
@@ -635,6 +740,25 @@ class Locoformer(Module):
         self.ppo_value_clip = ppo_value_clip
         self.value_loss_weight = value_loss_weight
 
+        self.calc_gae_kwargs = calc_gae_kwargs
+
+        # maybe use spo
+
+        self.use_spo = use_spo
+
+        # maybe recurrent kv cache (todo: find and cite this paper from ages ago)
+
+        self.recurrent_kv_cache = recurrent_kv_cache
+
+        # reward shaping function
+
+        self.has_reward_shaping = exists(reward_shaping_fns)
+        self.reward_shaping_fns = reward_shaping_fns
+
+        # loss related
+
+        self.register_buffer('zero', tensor(0.), persistent = False)
+
     @property
     def device(self):
         return next(self.parameters()).device
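`reward_shaping_fns` lets callers derive extra reward channels from whatever the wrapped environment step returns; each function may return a plain float or a tensor, and the new `state_to_rewards` method below stacks them. Two hypothetical shaping terms, purely illustrative:

    reward_shaping_fns = [
        lambda state: 1.,                              # constant alive bonus, plain floats are fine
        lambda state: -0.01 * state[0].square().sum()  # e.g. quadratic penalty on the observation part
    ]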
@@ -643,10 +767,10 @@ class Locoformer(Module):
         return self.unembedder.parameters()
 
     def critic_parameters(self):
-        if not exists(self.value_network):
+        if not exists(self.to_value_pred):
             return []
 
-        return self.value_network.parameters()
+        return self.to_value_pred.parameters()
 
     def ppo(
         self,
@@ -656,79 +780,150 @@
         reward,
         old_value,
         mask,
-        actor_optim,
-        critic_optim
+        episode_lens,
+        actor_optim: Optimizer | None = None,
+        critic_optim: Optimizer | None = None
     ):
+        window_size = self.window_size
+        total_learnable_tokens = mask.sum().item()
+
+        seq_len = state.shape[1]
+        gae_mask = einx.less('j, i -> i j', arange(seq_len, device = self.device), episode_lens)
+
+        advantage, returns = calc_gae(reward, old_value, masks = gae_mask, lam = self.gae_lam, gamma = self.discount_factor, **self.calc_gae_kwargs)
+
+        advantage = normalize(advantage)
+
+        windowed_tensors = [
+            t.split(window_size, dim = 1) for t in
+            (
+                state,
+                action,
+                old_action_log_prob,
+                reward,
+                old_value,
+                mask,
+                advantage,
+                returns
+            )
+        ]
+
+        mean_actor_loss = self.zero.clone()
+        mean_critic_loss = self.zero.clone()
 
-        entropy = calc_entropy(action_logits)
+        # learn across windows
 
-        log_prob = action_logits.gather(-1, action)
-        log_prob = rearrange(log_prob, 'b t 1 -> b t')
+        cache = None
 
+        for (
+            state,
+            action,
+            old_action_log_prob,
+            reward,
+            old_value,
+            mask,
+            advantage,
+            returns
+        ) in zip(*windowed_tensors):
 
+            (action_logits, value_logits), cache = self.forward(state, cache = cache, detach_cache = True, return_values = True, return_raw_value_logits = True)
+            entropy = calc_entropy(action_logits)
 
+            action = rearrange(action, 'b t -> b t 1')
+            log_prob = action_logits.gather(-1, action)
+            log_prob = rearrange(log_prob, 'b t 1 -> b t')
 
+            # update actor, classic clipped surrogate loss
 
+            eps_clip = self.ppo_eps_clip
+            ratio = (log_prob - old_action_log_prob).exp()
 
+            if self.use_spo:
+                actor_loss = -(ratio * advantage - (advantage.abs() * (ratio - 1.).square()) / (2 * eps_clip))
+            else:
+                actor_loss = -torch.min(ratio * advantage, ratio.clamp(1. - eps_clip, 1. + eps_clip) * advantage)
 
+            actor_loss = actor_loss - self.ppo_entropy_weight * entropy
 
+            windowed_actor_loss = actor_loss[mask].sum() / total_learnable_tokens
+            windowed_actor_loss.backward(retain_graph = True)
 
-        clipped_value = old_value + (value - old_value).clamp(-value_clip, value_clip)
-        clipped_value_loss = F.mse_loss(returns, clipped_value, reduction = 'none')
+            # update critic
 
+            value_loss = self.hl_gauss_loss(value_logits, returns, reduction = 'none')
 
+            value_clip = self.ppo_value_clip
+            value = self.hl_gauss_loss(value_logits)
+
+            clipped_value = old_value + (value - old_value).clamp(-value_clip, value_clip)
+            clipped_value_loss = self.hl_gauss_loss(clipped_value, returns, reduction = 'none')
+
+            critic_loss = torch.maximum(value_loss, clipped_value_loss) * self.value_loss_weight
+
+            windowed_critic_loss = critic_loss[mask].sum() / total_learnable_tokens
+            windowed_critic_loss.backward(retain_graph = True)
+
+            # accumulate
+
+            mean_actor_loss.add_(windowed_actor_loss)
+            mean_critic_loss.add_(windowed_critic_loss)
 
         # optimizer update
 
-        actor_optim.step()
-        actor_optim.zero_grad()
+        if exists(actor_optim):
+            actor_optim.step()
+            actor_optim.zero_grad()
 
-        critic_optim.step()
-        critic_optim.zero_grad()
+        if exists(critic_optim):
+            critic_optim.step()
+            critic_optim.zero_grad()
 
         # return losses for logging
 
         return mean_actor_loss.detach(), mean_critic_loss.detach()
 
+    def state_to_rewards(
+        self,
+        state
+    ) -> Tensor:
+
+        assert self.has_reward_shaping
+
+        rewards = [fn(state) for fn in self.reward_shaping_fns]
+
+        rewards = [tensor(reward) if not is_tensor(reward) else reward for reward in rewards]
+        return stack(rewards)
+
     def wrap_env_functions(self, env):
 
+        def transform_output(el):
+            if isinstance(el, ndarray):
+                return from_numpy(el)
+            elif isinstance(el, (int, bool, float)):
+                return tensor(el)
+            else:
+                return el
+
+        def wrapped_reset(*args, **kwargs):
+            env_reset_out = env.reset(*args, **kwargs)
+
+            return tree_map(transform_output, env_reset_out)
 
         def wrapped_step(action, *args, **kwargs):
-            out = env.step(action.item(), *args, **kwargs)
 
-            return el
+            if is_tensor(action):
+                action = action.item()
+
+            env_step_out = env.step(action, *args, **kwargs)
+
+            env_step_out_torch = tree_map(transform_output, env_step_out)
 
+            if not self.has_reward_shaping:
+                return env_step_out_torch
+
+            shaped_rewards = self.state_to_rewards(env_step_out_torch)
+
+            return env_step_out_torch, shaped_rewards
 
         return wrapped_reset, wrapped_step
 
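On the `use_spo` branch above: simple policy optimization replaces PPO's hard clip with a quadratic penalty on the ratio's deviation from 1, scaled by |A| / (2 * eps), so the surrogate stays smooth and differentiable everywhere. A toy comparison of the two objectives at a single point, mirroring the branch inside `ppo` with illustrative numbers:

    import torch

    ratio = torch.tensor(1.3)     # new over old action probability
    advantage = torch.tensor(2.0)
    eps_clip = 0.2

    ppo_obj = torch.min(ratio * advantage, ratio.clamp(1. - eps_clip, 1. + eps_clip) * advantage)
    spo_obj = ratio * advantage - advantage.abs() * (ratio - 1.).square() / (2 * eps_clip)

    # ppo_obj -> 2.4 (hard clipped), spo_obj -> 2.15 (smoothly penalized)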
@@ -738,6 +933,7 @@ class Locoformer(Module):
         inference_mode = False,
         has_batch_dim = False,
         has_time_dim = False,
+        state_time_dim = 1,
         **kwargs
     ):
         window_size = self.window_size
@@ -753,23 +949,16 @@
             state = rearrange(state, '... -> 1 ...')
 
         if not has_time_dim:
-            state = state.unsqueeze(1)
+            state = state.unsqueeze(state_time_dim)
 
         # forwards
 
         out, cache = self.forward(state, cache = cache, **{**kwargs, **override_kwargs})
 
-        # handle cache
-
-        cache_len = cache.shape[-2]
-
-        if self.fixed_window_size or divisible_by(cache_len, window_size * 2):
-            cache = cache[..., -window_size:, :]
-
         # maybe remove batch or time
 
         if not has_time_dim:
-            out = tree_map_tensor(out, lambda t: t.squeeze(1))
+            out = tree_map_tensor(out, lambda t: t.squeeze(state_time_dim))
 
         if not has_batch_dim:
             out = tree_map_tensor(out, lambda t: rearrange(t, '1 ... -> ...'))
@@ -798,16 +987,35 @@
     def forward(
         self,
         state: Tensor,
-        cache: Tensor | None = None,
+        cache: Cache | None = None,
         detach_cache = False,
-        return_values = False
+        return_values = False,
+        return_raw_value_logits = False
     ):
         state = state.to(self.device)
 
         tokens = self.embedder(state)
 
-        embed, kv_cache = self.transformer(tokens, cache = cache, return_kv_cache = True)
+        # time
+
+        time = tokens.shape[-2]
+
+        # destruct the cache for the current timestep and the cache
+
+        prev_kv_cache = None
+        timestep_start = 0
+
+        if exists(cache):
+            timestep_start, prev_kv_cache = cache
+
+        # an assert - make sure during training or inference, forward never gets anything that crosses the window segment boundary, to open up some possibilities with extending memory
+
+        assert ((timestep_start % self.window_size) + time) <= self.window_size
+
+        # attention
+
+        embed, kv_cache = self.transformer(tokens, cache = prev_kv_cache, return_kv_cache = True)
 
         # unembed to actions - in language models this would be the next state
 
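The new assert above means callers must feed `forward` chunks that never straddle a window segment boundary. A quick worked check of the arithmetic, with hypothetical numbers:

    window_size = 8

    # the cache says we are at absolute timestep 6; 4 more tokens would reach
    # (6 % 8) + 4 = 10 > 8, crossing the boundary and tripping the assert
    assert not ((6 % window_size) + 4 <= window_size)

    # 2 tokens land exactly on the boundary, which is allowed
    assert (6 % window_size) + 2 <= window_size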
@@ -818,21 +1026,34 @@
         # maybe detach cache
 
         if detach_cache:
-            kv_cache = detach_all(kv_cache)
+            kv_cache = kv_cache.detach()
 
         # handle returning of values
 
         if return_values:
-            assert exists(self.value_network)
+            assert exists(self.to_value_pred)
 
-            values = self.value_network(embed)
+            values = self.to_value_pred(embed)
 
-            values = rearrange(values, '... 1 -> ...')
+            if not return_raw_value_logits:
+                values = self.hl_gauss_loss(values) # converts the value logits to scalar values
 
             out = (out, values)
 
         # output and cache
 
-        return out, kv_cache
+        next_timestep = time + timestep_start
+
+        # handle curtailing kv cache at the right intervals
+
+        window_size = self.window_size
+
+        if self.fixed_window_size or divisible_by(next_timestep, window_size * 2):
+            kv_cache = kv_cache[..., -window_size:, :]
+
+        # maybe recurrent cache - shift the kv cache from one layer above to the one below, for extending on receptive field of past
+
+        if self.recurrent_kv_cache and divisible_by(next_timestep, window_size):
+            kv_cache = torch.roll(kv_cache, shifts = -1, dims = 0)
+
+        return out, (next_timestep, kv_cache)
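The recurrent cache trick at the end rolls the layer axis of the kv cache every time a window closes, so each layer attends to what the layer above cached one window earlier; stacked over depth, the receptive field extends well beyond a single XL window. A toy demonstration of the shift, assuming a (layers, batch, seq, dim) cache layout with the layer axis leading, as `dims = 0` implies:

    import torch

    # one distinct value per layer, for illustration
    kv_cache = torch.arange(4).view(4, 1, 1, 1).expand(4, 1, 8, 16).float()

    shifted = torch.roll(kv_cache, shifts = -1, dims = 0)

    assert (shifted[0] == kv_cache[1]).all()   # layer 0 now holds layer 1's memories
    assert (shifted[-1] == kv_cache[0]).all()  # the top layer wraps around to layer 0's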
{locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: locoformer
-Version: 0.0.15
+Version: 0.0.29
 Summary: LocoFormer
 Project-URL: Homepage, https://pypi.org/project/locoformer/
 Project-URL: Repository, https://github.com/lucidrains/locoformer
@@ -38,6 +38,7 @@ Requires-Dist: assoc-scan
 Requires-Dist: beartype
 Requires-Dist: einops>=0.8.0
 Requires-Dist: einx>=0.3.0
+Requires-Dist: hl-gauss-pytorch>=0.2.0
 Requires-Dist: rotary-embedding-torch
 Requires-Dist: torch>=2.4
 Requires-Dist: x-mlps-pytorch
@@ -54,7 +55,7 @@ Description-Content-Type: text/markdown
 
 [LocoFormer - Generalist Locomotion via Long-Context Adaptation](https://generalist-locomotion.github.io/)
 
-The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment)
+The gist is they trained a simple Transformer-XL in simulation on robots with many different bodies (cross-embodiment) and extreme domain randomization. When transferring to the real-world, they noticed the robot now gains the ability to adapt to insults. The XL memories span across multiple trials, which allowed the robot to learn in-context adaptation.
 
 ## Sponsors
 
locoformer-0.0.29.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+locoformer/__init__.py,sha256=XctsMGEZSR4mVl75fhds_1BtS5qGFiiItTDV7CmCt_I,45
+locoformer/locoformer.py,sha256=Tr_1btuoTZ0huXeDcAeuHxTPaVeCUEGc5iLvMYGDLck,29982
+locoformer-0.0.29.dist-info/METADATA,sha256=5Fi3EOsgpBvpzAFVZQyrlink-HcHE8EgFl10Y5l8mqM,3256
+locoformer-0.0.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+locoformer-0.0.29.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+locoformer-0.0.29.dist-info/RECORD,,
locoformer-0.0.15.dist-info/RECORD
REMOVED
@@ -1,6 +0,0 @@
-locoformer/__init__.py,sha256=XctsMGEZSR4mVl75fhds_1BtS5qGFiiItTDV7CmCt_I,45
-locoformer/locoformer.py,sha256=1jPK41G4HB1PEPtlusQxcrne489E-3QKXAULZ20FEZM,22740
-locoformer-0.0.15.dist-info/METADATA,sha256=IHtK7NvVQewYQ0GBB7v1KG90_H2Jakxir0MakUIA-jU,3218
-locoformer-0.0.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-locoformer-0.0.15.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-locoformer-0.0.15.dist-info/RECORD,,
{locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/WHEEL
File without changes

{locoformer-0.0.15.dist-info → locoformer-0.0.29.dist-info}/licenses/LICENSE
File without changes