evolutionary-policy-optimization 0.0.62__py3-none-any.whl → 0.0.64__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- evolutionary_policy_optimization/epo.py +81 -55
- evolutionary_policy_optimization/mock_env.py +1 -1
- {evolutionary_policy_optimization-0.0.62.dist-info → evolutionary_policy_optimization-0.0.64.dist-info}/METADATA +1 -1
- evolutionary_policy_optimization-0.0.64.dist-info/RECORD +9 -0
- evolutionary_policy_optimization-0.0.62.dist-info/RECORD +0 -9
- {evolutionary_policy_optimization-0.0.62.dist-info → evolutionary_policy_optimization-0.0.64.dist-info}/WHEEL +0 -0
- {evolutionary_policy_optimization-0.0.62.dist-info → evolutionary_policy_optimization-0.0.64.dist-info}/licenses/LICENSE +0 -0
evolutionary_policy_optimization/epo.py

@@ -3,6 +3,7 @@ from typing import Callable

 from pathlib import Path
 from math import ceil
+from itertools import product
 from functools import partial, wraps
 from collections import namedtuple
 from random import randrange
@@ -1067,7 +1068,8 @@ class EPO(Module):
         agent: Agent,
         episodes_per_latent,
         max_episode_length,
-        action_sample_temperature = 1
+        action_sample_temperature = 1.,
+        fix_environ_across_latents = True
     ):
         super().__init__()
         self.agent = agent
@@ -1076,6 +1078,7 @@ class EPO(Module):
         self.num_latents = agent.latent_gene_pool.num_latents
         self.episodes_per_latent = episodes_per_latent
         self.max_episode_length = max_episode_length
+        self.fix_environ_across_latents = fix_environ_across_latents

         self.register_buffer('dummy', tensor(0))

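Taken together, these two __init__ hunks add a `fix_environ_across_latents` switch that is stored on the module and, as the `forward` hunk further down shows, can still be overridden per call through the package's `default` helper. A minimal standalone sketch of that constructor-default-plus-per-call-override pattern (the `default` function and `RolloutConfig` class below are illustrative stand-ins, not part of the package):

def default(value, fallback):
    # stand-in for the package's helper: use the explicit value when given, else the fallback
    return value if value is not None else fallback

class RolloutConfig:
    def __init__(self, fix_environ_across_latents = True):
        self.fix_environ_across_latents = fix_environ_across_latents

    def forward(self, fix_environ_across_latents = None):
        # a per-call argument wins; otherwise fall back to the value chosen at construction
        return default(fix_environ_across_latents, self.fix_environ_across_latents)

cfg = RolloutConfig(fix_environ_across_latents = True)
assert cfg.forward() is True        # no override -> constructor default
assert cfg.forward(False) is False  # explicit override for a single rollout call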
@@ -1083,31 +1086,60 @@ class EPO(Module):
     def device(self):
         return self.dummy.device

-    def
+    def rollouts_for_machine(
+        self,
+        fix_environ_across_latents = False
+    ): # -> (<latent_id>, <episode_id>, <maybe synced env seed>) for the machine
+
         num_latents = self.num_latents
+        episodes = self.episodes_per_latent
+        num_latent_episodes = num_latents * episodes
+
+        # if fixing environment across latents, compute all the environment seeds upfront for simplicity
+
+        environment_seeds = None
+
+        if fix_environ_across_latents:
+            environment_seeds = torch.randint(0, int(1e6), (episodes,))
+
+            if is_distributed():
+                dist.all_reduce(environment_seeds) # reduce sum as a way to synchronize. it's fine
+
+        # get number of machines, and this machine id

         world_size, rank = get_world_and_rank()

-        assert
-
+        assert num_latent_episodes >= world_size, f'number of ({self.num_latents} latents x {self.episodes_per_latent} episodes) ({num_latent_episodes}) must be greater than world size ({world_size}) for now'
+
+        latent_episode_permutations = list(product(range(num_latents), range(episodes)))

-
+        num_rollouts_per_machine = ceil(num_latent_episodes / world_size)

-        for i in range(
-
+        for i in range(num_rollouts_per_machine):
+            rollout_id = rank * num_rollouts_per_machine + i

-            if
+            if rollout_id >= num_latent_episodes:
                 continue

-
+            latent_id, episode_id = latent_episode_permutations[rollout_id]
+
+            # maybe synchronized environment seed
+
+            maybe_seed = None
+            if fix_environ_across_latents:
+                maybe_seed = environment_seeds[episode_id]
+
+            yield latent_id, episode_id, maybe_seed

     @torch.no_grad()
     def forward(
         self,
         env,
-
+        fix_environ_across_latents = None
     ) -> MemoriesAndCumulativeRewards:

+        fix_environ_across_latents = default(fix_environ_across_latents, self.fix_environ_across_latents)
+
         self.agent.eval()

         invalid_episode = tensor(-1) # will use `episode_id` value of `-1` for the `next_value`, needed for not discarding last reward for generalized advantage estimate
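The new `rollouts_for_machine` generator is the core of this hunk: every (latent, episode) pair is enumerated with `itertools.product`, contiguous chunks of that list are handed to each rank, and, when `fix_environ_across_latents` is set, one seed per episode id is shared so every latent faces the same environment. A dependency-free sketch of that scheduling (the `rollouts_for_rank` function below is illustrative only, and plain `random` integers stand in for the `torch.randint` seeds synchronized with `dist.all_reduce`):

from itertools import product
from math import ceil
from random import randint

def rollouts_for_rank(rank, world_size, num_latents, episodes, environment_seeds = None):
    # enumerate every (latent_id, episode_id) pair, as the new generator does with product(...)
    pairs = list(product(range(num_latents), range(episodes)))
    assert len(pairs) >= world_size, 'need at least one rollout per machine'

    num_rollouts_per_machine = ceil(len(pairs) / world_size)

    for i in range(num_rollouts_per_machine):
        rollout_id = rank * num_rollouts_per_machine + i

        if rollout_id >= len(pairs):
            continue  # trailing ranks may receive fewer rollouts

        latent_id, episode_id = pairs[rollout_id]

        # same seed for a given episode id across all latents, so fitness differences
        # come from the latents rather than from environment randomness
        maybe_seed = environment_seeds[episode_id] if environment_seeds is not None else None
        yield latent_id, episode_id, maybe_seed

# 4 latents x 2 episodes spread over 3 machines, with one shared seed per episode id
shared_seeds = [randint(0, int(1e6)) for _ in range(2)]
for rank in range(3):
    print(rank, list(rollouts_for_rank(rank, 3, 4, 2, shared_seeds)))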
@@ -1116,79 +1148,73 @@ class EPO(Module):

         rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))

-
+        rollout_gen = self.rollouts_for_machine(fix_environ_across_latents)

-        for episode_id in tqdm(
+        for latent_id, episode_id, maybe_seed in tqdm(rollout_gen, desc = 'rollout'):

-
+            time = 0

-            #
+            # initial state

-
+            reset_kwargs = dict()

-            if
-                seed =
-                env_reset_kwargs = dict(seed = seed)
+            if fix_environ_across_latents:
+                reset_kwargs.update(seed = maybe_seed)

-
+            state = env.reset(**reset_kwargs)

-
-            time = 0
+            # get latent from pool

-
+            latent = self.agent.latent_gene_pool(latent_id = latent_id)

-
+            # until maximum episode length

-
+            done = tensor(False)

-
+            while time < self.max_episode_length and not done:

-                #
+                # sample action

-
+                action, log_prob = temp_batch_dim(self.agent.get_actor_actions)(state, latent = latent, sample = True, temperature = self.action_sample_temperature)

-
+                # values

-
+                value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True)

-
+                # get the next state, action, and reward

-
+                state, reward, done = env(action)

-
+                # update cumulative rewards per latent, to be used as default fitness score

-
+                rewards_per_latent_episode[latent_id, episode_id] += reward
+
+                # store memories

-
-
-
-
-
-
-
-
-
-
-                state,
-                tensor(latent_id),
-                action,
-                log_prob,
-                reward,
-                value,
-                done
-            )
+                memory = Memory(
+                    tensor(episode_id),
+                    state,
+                    tensor(latent_id),
+                    action,
+                    log_prob,
+                    reward,
+                    value,
+                    done
+                )

-
+                memories.append(memory)

-
+                time += 1

-
+            if not done:
+                # add bootstrap value if truncated

                 next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)

                 memory_for_gae = memory._replace(
                     episode_id = invalid_episode,
-                    value = next_value
+                    value = next_value,
+                    done = tensor(True)
                 )

                 memories.append(memory_for_gae)
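The last change in this hunk is worth pausing on: when the while-loop exits because of `max_episode_length` rather than a real terminal state, an extra memory is appended carrying the critic's estimate for the final state (with `done` forced to `True`), so generalized advantage estimation can bootstrap past the truncation instead of treating the return after the cutoff as zero. A framework-free sketch of that effect, using a textbook GAE recursion rather than the package's own implementation (`gae` and all the toy numbers below are illustrative):

def gae(rewards, values, dones, next_value, gamma = 0.99, lam = 0.95):
    # standard GAE recursion; `next_value` is the bootstrap for the state after the last stored step
    values = values + [next_value]
    advantages = []
    running = 0.
    for t in reversed(range(len(rewards))):
        not_done = 1. - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        running = delta + gamma * lam * not_done * running
        advantages.append(running)
    return list(reversed(advantages))

# an episode truncated after 3 steps: bootstrapping with the critic's estimate (5.0)
# keeps credit for rewards expected beyond the cutoff; a zero bootstrap discards it
print(gae([1., 1., 1.], [0.5, 0.6, 0.7], [False, False, False], next_value = 5.0))
print(gae([1., 1., 1.], [0.5, 0.6, 0.7], [False, False, False], next_value = 0.0))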
{evolutionary_policy_optimization-0.0.62.dist-info → evolutionary_policy_optimization-0.0.64.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.62
+Version: 0.0.64
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
evolutionary_policy_optimization-0.0.64.dist-info/RECORD

@@ -0,0 +1,9 @@
+evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
+evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
+evolutionary_policy_optimization/epo.py,sha256=0_jC9Tbl6FiscLHklvTKtuQTwZL8egqFKW-4JUxxwvw,37001
+evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
+evolutionary_policy_optimization/mock_env.py,sha256=gvATGA51Ym5sf3jiR2VmlpjiCcT7KCDDY_SrR-MEwsU,941
+evolutionary_policy_optimization-0.0.64.dist-info/METADATA,sha256=vWdnTe2a86wTenEh29TNJlYEjD8A5CPtsyylxh4XsE0,6220
+evolutionary_policy_optimization-0.0.64.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.64.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.64.dist-info/RECORD,,
evolutionary_policy_optimization-0.0.62.dist-info/RECORD

@@ -1,9 +0,0 @@
-evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
-evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
-evolutionary_policy_optimization/epo.py,sha256=lWhHpsfq6vpri6yeDXSTLRMKGPwl0kt3klh0fVaInSs,35921
-evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
-evolutionary_policy_optimization/mock_env.py,sha256=202KJ5g57wQvOzhGYzgHfBa7Y2do5uuDvl5kFg5o73g,934
-evolutionary_policy_optimization-0.0.62.dist-info/METADATA,sha256=oqJyUOXJwHrdf6JCVKPfOmhGJbXgqOmPWN_46l0JtWs,6220
-evolutionary_policy_optimization-0.0.62.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.62.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.62.dist-info/RECORD,,
The remaining files (WHEEL and licenses/LICENSE) have no content changes; only their dist-info directory was renamed from 0.0.62 to 0.0.64.