evolutionary-policy-optimization 0.0.63__tar.gz → 0.0.65__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/PKG-INFO +1 -1
  2. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/epo.py +27 -25
  3. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/mock_env.py +17 -5
  4. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/pyproject.toml +1 -1
  5. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.github/workflows/python-publish.yml +0 -0
  6. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.github/workflows/test.yml +0 -0
  7. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.gitignore +0 -0
  8. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/LICENSE +0 -0
  9. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/README.md +0 -0
  10. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/__init__.py +0 -0
  11. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/distributed.py +0 -0
  12. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/experimental.py +0 -0
  13. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/requirements.txt +0 -0
  14. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/tests/test_epo.py +0 -0
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evolutionary-policy-optimization
- Version: 0.0.63
+ Version: 0.0.65
  Summary: EPO - Pytorch
  Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
  Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/epo.py
@@ -114,18 +114,17 @@ def get_fitness_scores(
  # generalized advantage estimate

  def calc_generalized_advantage_estimate(
-     rewards, # Float[n]
-     values, # Float[n+1]
-     masks, # Bool[n]
+     rewards,
+     values,
+     masks,
      gamma = 0.99,
      lam = 0.95,
      use_accelerated = None
  ):
-     assert values.shape[-1] == (rewards.shape[-1] + 1)
-
      use_accelerated = default(use_accelerated, rewards.is_cuda)
      device = rewards.device

+     values = F.pad(values, (0, 1), value = 0.)
      values, values_next = values[:-1], values[1:]

      delta = rewards + gamma * values_next * masks - values
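
Note on the hunk above: 0.0.63 required the caller to pass `values` with one extra bootstrap entry (the removed `Float[n+1]` comment and shape assert), while 0.0.65 accepts `rewards`, `values` and `masks` of the same length n and right-pads `values` with a zero bootstrap itself. Below is a minimal sketch of the resulting computation in plain PyTorch; `gae_sketch` is an illustrative name, not a library function, and the `use_accelerated` scan path of the real implementation is not reproduced here.

    import torch
    import torch.nn.functional as F

    def gae_sketch(rewards, values, masks, gamma = 0.99, lam = 0.95):
        # all three inputs share length n; append a zero bootstrap value on the right
        values = F.pad(values, (0, 1), value = 0.)
        values, values_next = values[:-1], values[1:]

        # td residuals, same formula as in the hunk above
        delta = rewards + gamma * values_next * masks - values

        # plain reverse scan over the episode
        adv, out = torch.tensor(0.), []
        for t in reversed(range(rewards.shape[-1])):
            adv = delta[t] + gamma * lam * masks[t] * adv
            out.append(adv)
        return torch.stack(out[::-1])

    rewards = torch.tensor([1., 0., 2.])
    values  = torch.tensor([0.5, 0.4, 0.3])   # no (n+1)-th entry needed anymore
    masks   = torch.tensor([1., 1., 0.])
    print(gae_sketch(rewards, values, masks))
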
@@ -866,21 +865,16 @@ class Agent(Module):
          # generalized advantage estimate

          advantages = self.calc_gae(
-             rewards[:-1],
+             rewards,
              values,
-             masks[:-1],
+             masks,
          )

          # dataset and dataloader

          valid_episode = episode_ids >= 0

-         dataset = TensorDataset(
-             *[
-                 advantages[valid_episode[:-1]],
-                 *[t[valid_episode] for t in (states, latent_gene_ids, actions, log_probs, values)]
-             ]
-         )
+         dataset = TensorDataset(*[t[valid_episode] for t in (advantages, states, latent_gene_ids, actions, log_probs, values)])

          dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)

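Because `calc_gae` now consumes the full-length `rewards` and `masks` and pads internally, the returned `advantages` shares its first dimension with the other rollout tensors, so a single `valid_episode` mask can filter all of them before they are zipped into the `TensorDataset` (which requires every tensor to have the same size along dim 0). A toy illustration with made-up shapes, not the library's actual tensors:

    import torch
    from torch.utils.data import TensorDataset, DataLoader

    n = 10                                     # illustrative number of stored timesteps
    episode_ids = torch.randint(-1, 3, (n,))   # -1 marks bootstrap entries, as in the rollout code
    valid_episode = episode_ids >= 0

    advantages = torch.randn(n)                # same length as everything else in 0.0.65
    states     = torch.randn(n, 4)
    actions    = torch.randint(0, 2, (n,))

    # one boolean mask filters every tensor consistently
    dataset = TensorDataset(*[t[valid_episode] for t in (advantages, states, actions)])
    loader  = DataLoader(dataset, batch_size = 4, shuffle = True)
    print(len(dataset))
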
@@ -1068,7 +1062,8 @@ class EPO(Module):
          agent: Agent,
          episodes_per_latent,
          max_episode_length,
-         action_sample_temperature = 1.
+         action_sample_temperature = 1.,
+         fix_environ_across_latents = True
      ):
          super().__init__()
          self.agent = agent
@@ -1077,6 +1072,7 @@ class EPO(Module):
          self.num_latents = agent.latent_gene_pool.num_latents
          self.episodes_per_latent = episodes_per_latent
          self.max_episode_length = max_episode_length
+         self.fix_environ_across_latents = fix_environ_across_latents

          self.register_buffer('dummy', tensor(0))

@@ -1133,9 +1129,11 @@ class EPO(Module):
      def forward(
          self,
          env,
-         fix_environ_across_latents = True
+         fix_environ_across_latents = None
      ) -> MemoriesAndCumulativeRewards:

+         fix_environ_across_latents = default(fix_environ_across_latents, self.fix_environ_across_latents)
+
          self.agent.eval()

          invalid_episode = tensor(-1) # will use `episode_id` value of `-1` for the `next_value`, needed for not discarding last reward for generalized advantage estimate
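
The two hunks above promote `fix_environ_across_latents` to a constructor argument stored on the `EPO` instance; `forward` now defaults its keyword to `None` and falls back to the stored value through the `default` helper, so it can still be overridden per rollout call. A small standalone sketch of that constructor-default-with-override pattern; the class below is an illustrative stand-in, not the library's `EPO`:

    def default(value, fallback):
        # fall back when the argument is left as None, mirroring how the helper is used above
        return value if value is not None else fallback

    class RolloutConfigExample:
        def __init__(self, fix_environ_across_latents = True):
            self.fix_environ_across_latents = fix_environ_across_latents

        def forward(self, env, fix_environ_across_latents = None):
            # a keyword of None means "use whatever was chosen at construction time"
            return default(fix_environ_across_latents, self.fix_environ_across_latents)

    cfg = RolloutConfigExample(fix_environ_across_latents = True)
    assert cfg.forward(env = None) is True                                        # stored default
    assert cfg.forward(env = None, fix_environ_across_latents = False) is False   # per-call override
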
@@ -1179,12 +1177,14 @@

                      # get the next state, action, and reward

-                     state, reward, done = env(action)
+                     state, reward, truncated, terminated = env(action)
+
+                     done = truncated or terminated

                      # update cumulative rewards per latent, to be used as default fitness score

                      rewards_per_latent_episode[latent_id, episode_id] += reward
-
+
                      # store memories

                      memory = Memory(
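
The environment step now returns four values, `state, reward, truncated, terminated`, rather than a single `done` flag, mirroring the Gymnasium-style split between a time-limit cut-off and a genuine terminal state. Either flag ends the rollout loop, but as the next hunk shows, only `terminated` is written into the `Memory`. A tiny illustrative snippet of that bookkeeping (the values are made up):

    from torch import tensor

    truncated, terminated = tensor(True), tensor(False)   # e.g. max_episode_length was hit

    done = truncated or terminated   # either flag ends the episode loop
    stored_flag = terminated         # only true termination goes into the memory, so a
                                     # truncated episode can still receive a bootstrap value
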
@@ -1195,23 +1195,25 @@
                          log_prob,
                          reward,
                          value,
-                         done
+                         terminated
                      )

                      memories.append(memory)

                      time += 1

-                 # need the final next value for GAE, iiuc
+                 if not terminated:
+                     # add bootstrap value if truncated

-                 next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+                     next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)

-                 memory_for_gae = memory._replace(
-                     episode_id = invalid_episode,
-                     value = next_value
-                 )
+                     memory_for_gae = memory._replace(
+                         episode_id = invalid_episode,
+                         value = next_value,
+                         done = tensor(True)
+                     )

-                 memories.append(memory_for_gae)
+                     memories.append(memory_for_gae)

          return MemoriesAndCumulativeRewards(
              memories = memories,
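
The bootstrap memory (flagged with `episode_id = -1` and carrying the critic's value of the final state) is now appended only `if not terminated`. That matches the standard GAE reasoning: after a true terminal state the value of the next state is zero by definition, which the new zero right-padding inside `calc_generalized_advantage_estimate` already supplies, while after a truncation (for example hitting `max_episode_length`) the critic's estimate of the cut-off state should be bootstrapped so the last rewards are not treated as if the episode genuinely ended there. A textbook-GAE sketch of that distinction, independent of the library's exact memory layout; names and numbers are illustrative:

    import torch

    def gae(rewards, values, next_value, masks, gamma = 0.99, lam = 0.95):
        # single-episode GAE; `next_value` is the value of the state reached after the last step
        values_next = torch.cat((values[1:], next_value[None]))
        delta = rewards + gamma * values_next * masks - values
        adv, out = torch.tensor(0.), []
        for t in reversed(range(rewards.shape[-1])):
            adv = delta[t] + gamma * lam * masks[t] * adv
            out.append(adv)
        return torch.stack(out[::-1])

    rewards = torch.tensor([1., 1., 1.])
    values  = torch.tensor([0.9, 0.9, 0.9])

    # terminated: the post-terminal state is worth exactly zero, no bootstrap needed
    print(gae(rewards, values, next_value = torch.tensor(0.), masks = torch.tensor([1., 1., 0.])))

    # truncated: the episode was merely cut off, so bootstrap with the critic's estimate
    print(gae(rewards, values, next_value = torch.tensor(0.8), masks = torch.tensor([1., 1., 1.])))
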
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/mock_env.py
@@ -1,4 +1,5 @@
  from __future__ import annotations
+ from random import choice

  import torch
  from torch import tensor, randn, randint
@@ -14,21 +15,25 @@ def cast_tuple(v):
  class Env(Module):
      def __init__(
          self,
-         state_shape: int | tuple[int, ...]
+         state_shape: int | tuple[int, ...],
+         can_terminate_after = 2
      ):
          super().__init__()
          self.state_shape = cast_tuple(state_shape)
-         self.register_buffer('dummy', tensor(0))
+
+         self.can_terminate_after = can_terminate_after
+         self.register_buffer('step', tensor(0))

      @property
      def device(self):
-         return self.dummy.device
+         return self.step.device

      def reset(
          self,
          seed = None
      ):
          state = randn(self.state_shape, device = self.device)
+         self.step.zero_()
          return state

      def forward(
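
Replacing the `dummy` buffer with a `step` counter registered through `register_buffer` keeps the counter as part of the module's state, so it follows the module across devices; that is why the `device` property can simply read `self.step.device` and `reset` can clear it in place. A tiny sketch of that buffer behaviour, using a stand-in class rather than the package's `Env`:

    import torch
    from torch import tensor, nn

    class CounterModule(nn.Module):
        def __init__(self):
            super().__init__()
            # buffers move with .to()/.cuda() along with the rest of the module
            self.register_buffer('step', tensor(0))

        @property
        def device(self):
            return self.step.device

    m = CounterModule()
    m.step.add_(1)    # in-place update, like the mock env's per-step increment
    m.step.zero_()    # in-place reset, as done in Env.reset
    print(m.device)   # cpu here; after m.to('cuda') the buffer (and device) would follow
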
@@ -37,6 +42,13 @@ class Env(Module):
      ):
          state = randn(self.state_shape, device = self.device)
          reward = randint(0, 5, (), device = self.device).float()
-         done = torch.zeros((), device = self.device, dtype = torch.bool)

-         return state, reward, done
+         if self.step > self.can_terminate_after:
+             truncated = tensor(choice((True, False)), device =self.device)
+             terminated = tensor(choice((True, False)), device =self.device)
+         else:
+             truncated = terminated = tensor(False, device = self.device)
+
+         self.step.add_(1)
+
+         return state, reward, truncated, terminated
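
Taken together, the updated mock `Env` keeps an internal `step` buffer, clears it in `reset`, allows `truncated`/`terminated` to flip on only after `can_terminate_after` steps, and returns the four-tuple the new rollout loop expects. A usage sketch, assuming the 0.0.65 package is installed; the state shape and the action tensor are arbitrary since the mock env samples its outputs randomly:

    import torch
    from evolutionary_policy_optimization.mock_env import Env

    env = Env((5,), can_terminate_after = 2)   # the episode can only end after step 2

    state = env.reset()
    done = torch.tensor(False)

    while not done:
        action = torch.randint(0, 2, ())
        state, reward, truncated, terminated = env(action)
        done = truncated | terminated
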
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "evolutionary-policy-optimization"
- version = "0.0.63"
+ version = "0.0.65"
  description = "EPO - Pytorch"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }