evolutionary-policy-optimization 0.0.61-py3-none-any.whl → 0.0.63-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -3,6 +3,7 @@ from typing import Callable
 
 from pathlib import Path
 from math import ceil
+from itertools import product
 from functools import partial, wraps
 from collections import namedtuple
 from random import randrange
@@ -104,8 +105,11 @@ def temp_batch_dim(fn):
 
 # fitness related
 
-def get_fitness_scores(cum_rewards, memories):
-    return cum_rewards
+def get_fitness_scores(
+    cum_rewards, # Float['gene episodes']
+    memories
+): # Float['gene']
+    return cum_rewards.sum(dim = -1) # sum all rewards across episodes, but could override this function for normalizing with whatever
 
 # generalized advantage estimate
 
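The reworked get_fitness_scores receives per-episode cumulative rewards shaped Float['gene episodes'] and reduces them to one score per gene; its inline comment invites overriding. A minimal sketch of such an override, assuming only the same input and output shapes (the normalization shown here is illustrative, not the library default):

import torch

def normalized_fitness_scores(cum_rewards, memories):
    # hypothetical override: average rewards over episodes, then z-score across genes
    # so that fitness is insensitive to the reward scale of the environment
    scores = cum_rewards.mean(dim = -1)                          # Float['gene']
    return (scores - scores.mean()) / scores.std().clamp(min = 1e-6)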
@@ -684,7 +688,8 @@ class Agent(Module):
         ),
         actor_loss_kwargs: dict = dict(
             eps_clip = 0.2,
-            entropy_weight = .01
+            entropy_weight = .01,
+            norm_advantages = True
         ),
         ema_kwargs: dict = dict(),
         actor_optim_kwargs: dict = dict(),
@@ -826,9 +831,7 @@ class Agent(Module):
         memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories, cumulative_rewards = memories_and_cumulative_rewards
-
-        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
+        memories, rewards_per_latent_episode = memories_and_cumulative_rewards
 
         # stack memories
 
@@ -839,7 +842,13 @@ class Agent(Module):
         if is_distributed():
             memories = map(partial(all_gather_variable_dim, dim = 0), memories)
 
-            fitness_scores = all_gather_variable_dim(fitness_scores, dim = 0)
+            rewards_per_latent_episode = dist.all_reduce(rewards_per_latent_episode)
+
+        # calculate fitness scores
+
+        fitness_scores = self.get_fitness_scores(rewards_per_latent_episode, memories)
+
+        # process memories
 
         (
             episode_ids,
@@ -854,12 +863,16 @@ class Agent(Module):
 
         masks = 1. - dones.float()
 
+        # generalized advantage estimate
+
         advantages = self.calc_gae(
             rewards[:-1],
             values,
             masks[:-1],
         )
 
+        # dataset and dataloader
+
         valid_episode = episode_ids >= 0
 
         dataset = TensorDataset(
@@ -871,6 +884,8 @@ class Agent(Module):
 
         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)
 
+        # updating actor and critic
+
         self.actor.train()
         self.critic.train()
 
@@ -954,7 +969,8 @@ def actor_loss(
     advantages, # Float[b]
     eps_clip = 0.2,
     entropy_weight = .01,
-    eps = 1e-5
+    eps = 1e-5,
+    norm_advantages = True
):
     batch = logits.shape[0]
 
@@ -966,7 +982,8 @@ def actor_loss(
 
    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
 
-    advantages = F.layer_norm(advantages, (batch,), eps = eps)
+    if norm_advantages:
+        advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
    actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
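The new norm_advantages flag gates the existing F.layer_norm call, which, with no affine parameters, simply standardizes the advantages across the batch. A small self-contained check of that equivalence (the batch size of 8 is arbitrary):

import torch
import torch.nn.functional as F

advantages = torch.randn(8)
eps = 1e-5

# layer_norm over the full (batch,) shape = zero-mean, unit-variance standardization
standardized = (advantages - advantages.mean()) / torch.sqrt(advantages.var(unbiased = False) + eps)
via_layer_norm = F.layer_norm(advantages, (8,), eps = eps)

assert torch.allclose(standardized, via_layer_norm, atol = 1e-6)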
@@ -1041,7 +1058,7 @@ Memory = namedtuple('Memory', [
 
 MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'cumulative_rewards'
+    'cumulative_rewards' # Float['latent episodes']
 ])
 
 class EPO(Module):
@@ -1067,29 +1084,56 @@ class EPO(Module):
     def device(self):
         return self.dummy.device
 
-    def latents_for_machine(self):
+    def rollouts_for_machine(
+        self,
+        fix_environ_across_latents = False
+    ): # -> (<latent_id>, <episode_id>, <maybe synced env seed>) for the machine
+
         num_latents = self.num_latents
+        episodes = self.episodes_per_latent
+        num_latent_episodes = num_latents * episodes
+
+        # if fixing environment across latents, compute all the environment seeds upfront for simplicity
+
+        environment_seeds = None
+
+        if fix_environ_across_latents:
+            environment_seeds = torch.randint(0, int(1e6), (episodes,))
+
+            if is_distributed():
+                dist.all_reduce(environment_seeds) # reduce sum as a way to synchronize. it's fine
+
+        # get number of machines, and this machine id
 
         world_size, rank = get_world_and_rank()
 
-        assert num_latents >= world_size, 'number of latents must be greater than world size for now'
-        assert rank < world_size
+        assert num_latent_episodes >= world_size, f'number of ({self.num_latents} latents x {self.episodes_per_latent} episodes) ({num_latent_episodes}) must be greater than world size ({world_size}) for now'
+
+        latent_episode_permutations = list(product(range(num_latents), range(episodes)))
 
-        num_latents_per_machine = ceil(num_latents / world_size)
+        num_rollouts_per_machine = ceil(num_latent_episodes / world_size)
 
-        for i in range(num_latents_per_machine):
-            latent_id = rank * num_latents_per_machine + i
+        for i in range(num_rollouts_per_machine):
+            rollout_id = rank * num_rollouts_per_machine + i
 
-            if latent_id >= num_latents:
+            if rollout_id >= num_latent_episodes:
                 continue
 
-            yield i
+            latent_id, episode_id = latent_episode_permutations[rollout_id]
+
+            # maybe synchronized environment seed
+
+            maybe_seed = None
+            if fix_environ_across_latents:
+                maybe_seed = environment_seeds[episode_id]
+
+            yield latent_id, episode_id, maybe_seed
 
     @torch.no_grad()
     def forward(
         self,
         env,
-        fix_seed_across_latents = True
+        fix_environ_across_latents = True
    ) -> MemoriesAndCumulativeRewards:
 
         self.agent.eval()
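rollouts_for_machine now enumerates every (latent_id, episode_id) pair and hands each rank a contiguous chunk of that flattened list. A standalone sketch of the partitioning with made-up sizes (4 latents, 3 episodes per latent, 2 machines), independent of the package:

from math import ceil
from itertools import product

num_latents, episodes, world_size = 4, 3, 2            # assumed toy values

rollouts = list(product(range(num_latents), range(episodes)))
per_machine = ceil(len(rollouts) / world_size)

for rank in range(world_size):
    chunk = rollouts[rank * per_machine : (rank + 1) * per_machine]
    print(f'rank {rank} runs rollouts {chunk}')        # each item is (latent_id, episode_id)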
@@ -1098,86 +1142,78 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-        cumulative_rewards = torch.zeros((self.num_latents))
+        rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
-        latent_ids_gen = self.latents_for_machine()
+        rollout_gen = self.rollouts_for_machine(fix_environ_across_latents)
 
-        for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):
+        for latent_id, episode_id, maybe_seed in tqdm(rollout_gen, desc = 'rollout'):
 
-            maybe_barrier()
+            time = 0
 
-            # maybe fix seed for environment across all latents
+            # initial state
 
-            env_reset_kwargs = dict()
+            reset_kwargs = dict()
 
-            if fix_seed_across_latents:
-                seed = maybe_sync_seed(device = self.device)
-                env_reset_kwargs = dict(seed = seed)
+            if fix_environ_across_latents:
+                reset_kwargs.update(seed = maybe_seed)
 
-            # for each latent (on a single machine for now)
+            state = env.reset(**reset_kwargs)
 
-            for latent_id in tqdm(latent_ids_gen, desc = 'latent'):
-                time = 0
+            # get latent from pool
 
-                # initial state
+            latent = self.agent.latent_gene_pool(latent_id = latent_id)
 
-                state = env.reset(**env_reset_kwargs)
+            # until maximum episode length
 
-                # get latent from pool
+            done = tensor(False)
 
-                latent = self.agent.latent_gene_pool(latent_id = latent_id)
+            while time < self.max_episode_length and not done:
 
-                # until maximum episode length
+                # sample action
 
-                done = tensor(False)
+                action, log_prob = temp_batch_dim(self.agent.get_actor_actions)(state, latent = latent, sample = True, temperature = self.action_sample_temperature)
 
-                while time < self.max_episode_length and not done:
+                # values
 
-                    # sample action
+                value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True)
 
-                    action, log_prob = temp_batch_dim(self.agent.get_actor_actions)(state, latent = latent, sample = True, temperature = self.action_sample_temperature)
+                # get the next state, action, and reward
 
-                    # values
+                state, reward, done = env(action)
 
-                    value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True)
+                # update cumulative rewards per latent, to be used as default fitness score
 
-                    # get the next state, action, and reward
+                rewards_per_latent_episode[latent_id, episode_id] += reward
+
+                # store memories
 
-                    state, reward, done = env(action)
-
-                    # update cumulative rewards per latent, to be used as default fitness score
-
-                    cumulative_rewards[latent_id] += reward
-
-                    # store memories
-
-                    memory = Memory(
-                        tensor(episode_id),
-                        state,
-                        tensor(latent_id),
-                        action,
-                        log_prob,
-                        reward,
-                        value,
-                        done
-                    )
+                memory = Memory(
+                    tensor(episode_id),
+                    state,
+                    tensor(latent_id),
+                    action,
+                    log_prob,
+                    reward,
+                    value,
+                    done
+                )
 
-                    memories.append(memory)
+                memories.append(memory)
 
-                    time += 1
+                time += 1
 
-                # need the final next value for GAE, iiuc
+            # need the final next value for GAE, iiuc
 
-                next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+            next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
 
-                memory_for_gae = memory._replace(
-                    episode_id = invalid_episode,
-                    value = next_value
-                )
+            memory_for_gae = memory._replace(
+                episode_id = invalid_episode,
+                value = next_value
+            )
 
-                memories.append(memory_for_gae)
+            memories.append(memory_for_gae)
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
-            cumulative_rewards = cumulative_rewards
+            cumulative_rewards = rewards_per_latent_episode
         )
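Cumulative rewards are now accumulated per (latent, episode) rather than per latent, and the default fitness score recovers the old per-latent total by summing over episodes. A small shape sketch with assumed sizes (8 latents, 2 episodes per latent), mirroring the indexing in the rollout loop and the reduction in get_fitness_scores:

import torch

num_latents, episodes_per_latent = 8, 2                       # assumed toy sizes

rewards_per_latent_episode = torch.zeros((num_latents, episodes_per_latent))
rewards_per_latent_episode[3, 1] += 1.5                       # as done per step inside the rollout loop

fitness_scores = rewards_per_latent_episode.sum(dim = -1)     # Float['gene'], the default fitness
print(fitness_scores.shape)                                   # torch.Size([8])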
@@ -26,7 +26,7 @@ class Env(Module):
 
     def reset(
         self,
-        seed
+        seed = None
    ):
         state = randn(self.state_shape, device = self.device)
         return state
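With the default argument, the mock environment can now be reset with or without a seed, matching how the rollout loop only passes a seed when fix_environ_across_latents is enabled. A hedged usage sketch (the Env constructor argument below is an assumption, taken to be the state shape):

from evolutionary_policy_optimization.mock_env import Env

env = Env((5,))                   # assumed: constructor takes the state shape
state = env.reset()               # seed now defaults to None
state = env.reset(seed = 42)      # still accepted, as the EPO rollout loop does when syncing environments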
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.61
+Version: 0.0.63
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -0,0 +1,9 @@
+evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
+evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
+evolutionary_policy_optimization/epo.py,sha256=DSG2fYWLk0cyHhfoiwqmSzh2TBOWhz25sD1oWIM5p1k,36695
+evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
+evolutionary_policy_optimization/mock_env.py,sha256=gvATGA51Ym5sf3jiR2VmlpjiCcT7KCDDY_SrR-MEwsU,941
+evolutionary_policy_optimization-0.0.63.dist-info/METADATA,sha256=X2FKT8WJ9T1t0ydEdtxrJsJGXY1ubfvydQSykv2G03M,6220
+evolutionary_policy_optimization-0.0.63.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.63.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.63.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
-evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
-evolutionary_policy_optimization/epo.py,sha256=kFT49rJdcmaDehfpx3YyhYhvAcp7S-gRWDkS2y20Q2Y,35377
-evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
-evolutionary_policy_optimization/mock_env.py,sha256=202KJ5g57wQvOzhGYzgHfBa7Y2do5uuDvl5kFg5o73g,934
-evolutionary_policy_optimization-0.0.61.dist-info/METADATA,sha256=3IbcY9kg71P6lTNxZaRBw3IYfDjcK4uTJJaFRD0Skwg,6220
-evolutionary_policy_optimization-0.0.61.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.61.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.61.dist-info/RECORD,,