evolutionary-policy-optimization 0.0.62__py3-none-any.whl → 0.0.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@ from typing import Callable
 
 from pathlib import Path
 from math import ceil
+from itertools import product
 from functools import partial, wraps
 from collections import namedtuple
 from random import randrange
@@ -1083,29 +1084,56 @@ class EPO(Module):
     def device(self):
         return self.dummy.device
 
-    def latents_for_machine(self):
+    def rollouts_for_machine(
+        self,
+        fix_environ_across_latents = False
+    ): # -> (<latent_id>, <episode_id>, <maybe synced env seed>) for the machine
+
         num_latents = self.num_latents
+        episodes = self.episodes_per_latent
+        num_latent_episodes = num_latents * episodes
+
+        # if fixing environment across latents, compute all the environment seeds upfront for simplicity
+
+        environment_seeds = None
+
+        if fix_environ_across_latents:
+            environment_seeds = torch.randint(0, int(1e6), (episodes,))
+
+            if is_distributed():
+                dist.all_reduce(environment_seeds) # reduce sum as a way to synchronize. it's fine
+
+        # get number of machines, and this machine id
 
         world_size, rank = get_world_and_rank()
 
-        assert num_latents >= world_size, 'number of latents must be greater than world size for now'
-        assert rank < world_size
+        assert num_latent_episodes >= world_size, f'number of ({self.num_latents} latents x {self.episodes_per_latent} episodes) ({num_latent_episodes}) must be greater than world size ({world_size}) for now'
+
+        latent_episode_permutations = list(product(range(num_latents), range(episodes)))
 
-        num_latents_per_machine = ceil(num_latents / world_size)
+        num_rollouts_per_machine = ceil(num_latent_episodes / world_size)
 
-        for i in range(num_latents_per_machine):
-            latent_id = rank * num_latents_per_machine + i
+        for i in range(num_rollouts_per_machine):
+            rollout_id = rank * num_rollouts_per_machine + i
 
-            if latent_id >= num_latents:
+            if rollout_id >= num_latent_episodes:
                 continue
 
-            yield i
+            latent_id, episode_id = latent_episode_permutations[rollout_id]
+
+            # maybe synchronized environment seed
+
+            maybe_seed = None
+            if fix_environ_across_latents:
+                maybe_seed = environment_seeds[episode_id]
+
+            yield latent_id, episode_id, maybe_seed
 
     @torch.no_grad()
     def forward(
         self,
         env,
-        fix_seed_across_latents = True
+        fix_environ_across_latents = True
    ) -> MemoriesAndCumulativeRewards:
 
         self.agent.eval()
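
Note: the new rollouts_for_machine synchronizes environment seeds by having every rank draw the same number of seeds and then summing them with an all-reduce, so all ranks end up with identical values. A minimal standalone sketch of that trick, assuming only torch and torch.distributed (the helper name synced_episode_seeds is illustrative, not part of the package):

    import torch
    import torch.distributed as dist

    def synced_episode_seeds(num_episodes: int) -> torch.Tensor:
        # each rank draws its own random episode seeds ...
        seeds = torch.randint(0, int(1e6), (num_episodes,))

        # ... then a sum all-reduce makes the result identical on every rank
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(seeds)

        return seeds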
@@ -1116,82 +1144,74 @@ class EPO(Module):
 
         rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
-        latent_ids_gen = self.latents_for_machine()
-
-        for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):
+        rollout_gen = self.rollouts_for_machine(fix_environ_across_latents)
 
-            maybe_barrier()
+        for latent_id, episode_id, maybe_seed in tqdm(rollout_gen, desc = 'rollout'):
 
-            # maybe fix seed for environment across all latents
+            time = 0
 
-            env_reset_kwargs = dict()
+            # initial state
 
-            if fix_seed_across_latents:
-                seed = maybe_sync_seed(device = self.device)
-                env_reset_kwargs = dict(seed = seed)
+            reset_kwargs = dict()
 
-            # for each latent (on a single machine for now)
+            if fix_environ_across_latents:
+                reset_kwargs.update(seed = maybe_seed)
 
-            for latent_id in tqdm(latent_ids_gen, desc = 'latent'):
-                time = 0
+            state = env.reset(**reset_kwargs)
 
-                # initial state
+            # get latent from pool
 
-                state = env.reset(**env_reset_kwargs)
+            latent = self.agent.latent_gene_pool(latent_id = latent_id)
 
-                # get latent from pool
+            # until maximum episode length
 
-                latent = self.agent.latent_gene_pool(latent_id = latent_id)
+            done = tensor(False)
 
-                # until maximum episode length
+            while time < self.max_episode_length and not done:
 
-                done = tensor(False)
+                # sample action
 
-                while time < self.max_episode_length and not done:
+                action, log_prob = temp_batch_dim(self.agent.get_actor_actions)(state, latent = latent, sample = True, temperature = self.action_sample_temperature)
 
-                    # sample action
+                # values
 
-                    action, log_prob = temp_batch_dim(self.agent.get_actor_actions)(state, latent = latent, sample = True, temperature = self.action_sample_temperature)
+                value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True)
 
-                    # values
+                # get the next state, action, and reward
 
-                    value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True)
+                state, reward, done = env(action)
 
-                    # get the next state, action, and reward
+                # update cumulative rewards per latent, to be used as default fitness score
 
-                    state, reward, done = env(action)
+                rewards_per_latent_episode[latent_id, episode_id] += reward
+
+                # store memories
 
-                    # update cumulative rewards per latent, to be used as default fitness score
-
-                    rewards_per_latent_episode[latent_id, episode_id] += reward
-
-                    # store memories
-
-                    memory = Memory(
-                        tensor(episode_id),
-                        state,
-                        tensor(latent_id),
-                        action,
-                        log_prob,
-                        reward,
-                        value,
-                        done
-                    )
+                memory = Memory(
+                    tensor(episode_id),
+                    state,
+                    tensor(latent_id),
+                    action,
+                    log_prob,
+                    reward,
+                    value,
+                    done
+                )
 
-                    memories.append(memory)
+                memories.append(memory)
 
-                    time += 1
+                time += 1
 
-                # need the final next value for GAE, iiuc
+            # need the final next value for GAE, iiuc
 
-                next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+            next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
 
-                memory_for_gae = memory._replace(
-                    episode_id = invalid_episode,
-                    value = next_value
-                )
+            memory_for_gae = memory._replace(
+                episode_id = invalid_episode,
+                value = next_value
+            )
 
-                memories.append(memory_for_gae)
+            memories.append(memory_for_gae)
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
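
Note: the rollout loop above now iterates over flat (latent, episode) pairs instead of nesting an episode loop around a latent loop; rollouts_for_machine flattens the cartesian product and hands each rank a contiguous slice of it. A minimal sketch of just that partitioning, assuming nothing beyond the standard library (the function name rollouts_for_rank is illustrative, not part of the package):

    from itertools import product
    from math import ceil

    def rollouts_for_rank(num_latents, episodes_per_latent, world_size, rank):
        # flatten latents x episodes into a single list of rollouts
        pairs = list(product(range(num_latents), range(episodes_per_latent)))
        per_machine = ceil(len(pairs) / world_size)

        for i in range(per_machine):
            rollout_id = rank * per_machine + i
            if rollout_id >= len(pairs):
                continue
            yield pairs[rollout_id]  # (latent_id, episode_id)

    # e.g. 3 latents x 2 episodes split over 2 machines
    assert list(rollouts_for_rank(3, 2, 2, 0)) == [(0, 0), (0, 1), (1, 0)]
    assert list(rollouts_for_rank(3, 2, 2, 1)) == [(1, 1), (2, 0), (2, 1)]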
@@ -26,7 +26,7 @@ class Env(Module):
 
     def reset(
         self,
-        seed
+        seed = None
    ):
        state = randn(self.state_shape, device = self.device)
        return state
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.62
+Version: 0.0.63
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -0,0 +1,9 @@
+evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
+evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
+evolutionary_policy_optimization/epo.py,sha256=DSG2fYWLk0cyHhfoiwqmSzh2TBOWhz25sD1oWIM5p1k,36695
+evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
+evolutionary_policy_optimization/mock_env.py,sha256=gvATGA51Ym5sf3jiR2VmlpjiCcT7KCDDY_SrR-MEwsU,941
+evolutionary_policy_optimization-0.0.63.dist-info/METADATA,sha256=X2FKT8WJ9T1t0ydEdtxrJsJGXY1ubfvydQSykv2G03M,6220
+evolutionary_policy_optimization-0.0.63.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.63.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.63.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
-evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
-evolutionary_policy_optimization/epo.py,sha256=lWhHpsfq6vpri6yeDXSTLRMKGPwl0kt3klh0fVaInSs,35921
-evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
-evolutionary_policy_optimization/mock_env.py,sha256=202KJ5g57wQvOzhGYzgHfBa7Y2do5uuDvl5kFg5o73g,934
-evolutionary_policy_optimization-0.0.62.dist-info/METADATA,sha256=oqJyUOXJwHrdf6JCVKPfOmhGJbXgqOmPWN_46l0JtWs,6220
-evolutionary_policy_optimization-0.0.62.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.62.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.62.dist-info/RECORD,,