evolutionary-policy-optimization 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evolutionary_policy_optimization/__init__.py
@@ -4,5 +4,8 @@ from evolutionary_policy_optimization.epo import (
     Critic,
     create_agent,
     Agent,
-    LatentGenePool
+    LatentGenePool,
+    EPO
 )
+
+from evolutionary_policy_optimization.mock_env import Env
evolutionary_policy_optimization/epo.py
@@ -5,13 +5,13 @@ from pathlib import Path
 from collections import namedtuple

 import torch
-from torch import nn, cat, is_tensor, tensor
+from torch import nn, cat, stack, is_tensor, tensor
 import torch.nn.functional as F
 from torch.nn import Linear, Module, ModuleList
 from torch.utils.data import TensorDataset, DataLoader

 import einx
-from einops import rearrange, repeat, einsum
+from einops import rearrange, repeat, einsum, pack
 from einops.layers.torch import Rearrange

 from assoc_scan import AssocScan
@@ -22,6 +22,8 @@ from hl_gauss_pytorch import HLGaussLayer

 from ema_pytorch import EMA

+from tqdm import tqdm
+
 # helpers

 def exists(v):
@@ -47,9 +49,20 @@ def l2norm(t):
 def log(t, eps = 1e-20):
     return t.clamp(min = eps).log()

+def gumbel_noise(t):
+    return -log(-log(torch.rand_like(t)))
+
+def gumbel_sample(t, temperature = 1.):
+    is_greedy = temperature <= 0.
+
+    if not is_greedy:
+        t = (t / temperature) + gumbel_noise(t)
+
+    return t.argmax(dim = -1)
+
 def calc_entropy(logits):
     prob = logits.softmax(dim = -1)
-    return -prob * log(prob)
+    return -(prob * log(prob)).sum(dim = -1)

 def gather_log_prob(
     logits, # Float[b l]
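The new `gumbel_noise` / `gumbel_sample` helpers sample discrete actions directly from logits, falling back to a plain argmax when `temperature <= 0.`. A minimal self-contained sketch of the idea (the logits here are made up for illustration):

```python
import torch

def log(t, eps = 1e-20):
    return t.clamp(min = eps).log()

def gumbel_noise(t):
    # -log(-log(U)) turns uniform noise U into Gumbel(0, 1) noise
    return -log(-log(torch.rand_like(t)))

def gumbel_sample(t, temperature = 1.):
    if temperature > 0.:
        t = (t / temperature) + gumbel_noise(t)
    return t.argmax(dim = -1)

logits = torch.randn(4, 5)                                  # 4 states, 5 discrete actions

stochastic_actions = gumbel_sample(logits)                  # varies run to run
greedy_actions = gumbel_sample(logits, temperature = 0.)    # degenerates to argmax

assert torch.equal(greedy_actions, logits.argmax(dim = -1))
```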
@@ -63,8 +76,8 @@ def gather_log_prob(
 # generalized advantage estimate

 def calc_generalized_advantage_estimate(
-    rewards, # Float[g n]
-    values, # Float[g n+1]
+    rewards, # Float[n]
+    values, # Float[n+1]
     masks, # Bool[n]
     gamma = 0.99,
     lam = 0.95,
@@ -75,9 +88,7 @@ def calc_generalized_advantage_estimate(
     use_accelerated = default(use_accelerated, rewards.is_cuda)
     device = rewards.device

-    masks = repeat(masks, 'n -> g n', g = rewards.shape[0])
-
-    values, values_next = values[:, :-1], values[:, 1:]
+    values, values_next = values[:-1], values[1:]

     delta = rewards + gamma * values_next * masks - values
     gates = gamma * lam * masks
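With the gene/batch dimension gone, `calc_generalized_advantage_estimate` now works on a single trajectory: `rewards` and `masks` of length `n`, `values` of length `n + 1` (the extra entry being the bootstrap value), using the recursion `A_t = delta_t + gamma * lam * mask_t * A_{t+1}`. A slow reference loop, written only to pin down that shape convention (names are illustrative, not the package's accelerated scan):

```python
import torch

def gae_reference(rewards, values, masks, gamma = 0.99, lam = 0.95):
    # rewards: Float[n], values: Float[n + 1], masks: Bool/Float[n]
    values, values_next = values[:-1], values[1:]

    advantages = torch.zeros_like(rewards)
    running = torch.tensor(0.)

    # backward recursion over the trajectory
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values_next[t] * masks[t] - values[t]
        running = delta + gamma * lam * masks[t] * running
        advantages[t] = running

    return advantages

rewards, values, masks = torch.randn(10), torch.randn(11), torch.ones(10)
assert gae_reference(rewards, values, masks).shape == (10,)
```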
@@ -319,7 +330,6 @@ class LatentGenePool(Module):
         num_latents, # same as gene pool size
         dim_latent, # gene dimension
         num_islands = 1, # add the island strategy, which has been effectively used in a few recent works
-        dim_state = None,
         frozen_latents = True,
         crossover_random = True, # random interp from parent1 to parent2 for crossover, set to `False` for averaging (0.5 constant value)
         l2norm_latent = False, # whether to enforce latents on hypersphere,
@@ -384,7 +394,6 @@ class LatentGenePool(Module):
         fitness,
         beta0 = 2., # exploitation factor, moving fireflies of low light intensity to high
         gamma = 1., # controls light intensity decay over distance - setting this to zero will make firefly equivalent to vanilla PSO
-        alpha = 0.1, # exploration factor
         inplace = True,
     ):
         islands = self.num_islands
@@ -555,7 +564,6 @@ class LatentGenePool(Module):
     def forward(
         self,
         *args,
-        state: Tensor | None = None,
         latent_id: int | None = None,
         net: Module | None = None,
         net_latent_kwarg_name = 'latent',
@@ -568,6 +576,8 @@ class LatentGenePool(Module):
        if not exists(latent_id) and self.num_latents == 1:
            latent_id = 0

+        assert exists(latent_id)
+
        if not is_tensor(latent_id):
            latent_id = tensor(latent_id, device = device)

@@ -575,8 +585,6 @@ class LatentGenePool(Module):

        # fetch latent

-        fetching_multiple_latents = latent_id.numel() > 1
-
        latent = self.latents[latent_id]

        latent = self.maybe_l2norm(latent)
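`LatentGenePool.forward` no longer accepts a `state` and now asserts that a `latent_id` was given (except when the pool holds a single latent). It takes either a plain integer or a tensor of ids, which is how the PPO update fetches a whole batch of genes at once. A minimal hedged usage sketch:

```python
import torch
from evolutionary_policy_optimization import LatentGenePool

latent_pool = LatentGenePool(num_latents = 8, dim_latent = 32)

latent = latent_pool(latent_id = 3)                         # single gene
latents = latent_pool(latent_id = torch.tensor([0, 1, 5]))  # batch of genes, as in the learning loop
```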
@@ -686,17 +694,38 @@ class Agent(Module):
     def get_actor_actions(
         self,
         state,
-        latent_id
+        latent_id = None,
+        latent = None,
+        sample = False,
+        temperature = 1.
     ):
-        latent = self.latent_gene_pool(latent_id = latent_id, state = state)
-        return self.actor(state, latent)
+        assert exists(latent_id) or exists(latent)
+
+        if not exists(latent):
+            latent = self.latent_gene_pool(latent_id = latent_id)
+
+        logits = self.actor(state, latent)
+
+        if not sample:
+            return logits
+
+        actions = gumbel_sample(logits, temperature = temperature)
+
+        log_probs = gather_log_prob(logits, actions)
+
+        return actions, log_probs

     def get_critic_values(
         self,
         state,
-        latent_id
+        latent_id = None,
+        latent = None
     ):
-        latent = self.latent_gene_pool(latent_id = latent_id, state = state)
+        assert exists(latent_id) or exists(latent)
+
+        if not exists(latent):
+            latent = self.latent_gene_pool(latent_id = latent_id)
+
         return self.critic(state, latent)

     def update_latent_gene_pool_(
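With these changes `get_actor_actions` either returns raw logits (the default) or, with `sample = True`, a tuple of sampled actions and their log probabilities; both methods also accept a pre-fetched `latent` instead of a `latent_id`. A hedged usage sketch against an agent built with `create_agent` (keyword arguments mirror the README example further down; the state shape is an assumption for illustration):

```python
import torch
from evolutionary_policy_optimization import create_agent

agent = create_agent(
    dim_state = 512,
    num_latents = 8,
    dim_latent = 32,
    actor_num_actions = 5,
    actor_dim_hiddens = (256, 128),
    critic_dim_hiddens = (256, 128, 64)
)

state = torch.randn(1, 512)

# either pass a latent_id ...
logits = agent.get_actor_actions(state, latent_id = 3)

# ... or fetch the latent once and reuse it, sampling an action this time
latent = agent.latent_gene_pool(latent_id = 3)
action, log_prob = agent.get_actor_actions(state, latent = latent, sample = True, temperature = 1.)

value = agent.get_critic_values(state, latent = latent)
```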
@@ -707,12 +736,13 @@ class Agent(Module):

     def forward(
         self,
-        memories_and_next_value: MemoriesAndNextValue,
+        memories_and_fitness_scores: MemoriesAndFitnessScores,
         epochs = 2
     ):
-        memories, next_value = memories_and_next_value
+        memories, fitness_scores = memories_and_fitness_scores

         (
+            episode_ids,
             states,
             latent_gene_ids,
             actions,
@@ -722,35 +752,46 @@ class Agent(Module):
             dones
         ) = map(stack, zip(*memories))

-        values_with_next, ps = pack((values, next_value), '*')
+        advantages = self.calc_gae(
+            rewards[:-1],
+            values,
+            dones[:-1],
+        )

-        advantages = self.calc_gae(rewards, values_with_next, dones)
+        valid_episode = episode_ids >= 0

-        dataset = TensorDataset(states, latent_gene_ids, actions, log_probs, advantages, values)
+        dataset = TensorDataset(
+            *[
+                advantages[valid_episode[:-1]],
+                *[t[valid_episode] for t in (states, latent_gene_ids, actions, log_probs, values)]
+            ]
+        )

         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)

         self.actor.train()
         self.critic.train()

-        for _ in range(epochs):
+        for _ in tqdm(range(epochs), desc = 'learning actor/critic epoch'):
             for (
+                advantages,
                 states,
                 latent_gene_ids,
                 actions,
                 log_probs,
-                advantages,
                 old_values
             ) in dataloader:

-                latents = self.latent_gene_pool(latent_gene_ids)
+                latents = self.latent_gene_pool(latent_id = latent_gene_ids)

                 # learn actor

                 logits = self.actor(states, latents)
+
                 actor_loss = self.actor_loss(logits, log_probs, actions, advantages)

                 actor_loss.backward()
+
                 self.actor_optim.step()
                 self.actor_optim.zero_grad()

@@ -759,7 +800,7 @@ class Agent(Module):
                 critic_loss = self.critic(
                     states,
                     latents,
-                    targets = advantages + old_values
+                    target = advantages + old_values
                 )

                 critic_loss.backward()
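Two conventions introduced in the reshuffled `Agent.forward` are easy to miss: the rollout appends one extra memory per episode whose `episode_id` is `-1` and whose `value` slot carries the bootstrap value (so the stacked `values` run one longer than the usable `rewards` / `dones`), and the critic is now handed `target = advantages + old_values` and returns its own loss. A toy illustration of the sentinel masking, with made-up numbers:

```python
import torch

# one episode of length 3 plus the sentinel bootstrap row appended by the rollout
episode_ids = torch.tensor([0, 0, 0, -1])
rewards     = torch.tensor([1., 0., 1., 0.])
values      = torch.tensor([0.10, 0.20, 0.30, 0.40])  # last entry = bootstrap value

# GAE consumes all n + 1 values but only the first n rewards / dones
rewards_for_gae = rewards[:-1]   # length 3
values_for_gae  = values         # length 4

# the sentinel row is excluded from the PPO training dataset
valid_episode = episode_ids >= 0
assert values[valid_episode].shape == (3,)
```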
@@ -767,6 +808,10 @@ class Agent(Module):
                 self.critic_optim.step()
                 self.critic_optim.zero_grad()

+        # apply evolution
+
+        self.latent_gene_pool.genetic_algorithm_step(fitness_scores)
+
 # reinforcement learning related - ppo

 def actor_loss(
@@ -785,7 +830,7 @@ def actor_loss(

    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)

-    actor_loss = -torch.min(clipped_ratio * advantage, ratio * advantage)
+    actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)

    # add entropy loss for exploration

@@ -793,15 +838,7 @@ def actor_loss(

    entropy_aux_loss = -entropy_weight * entropy

-    return actor_loss + entropy_aux_loss
-
-def critic_loss(
-    pred_values, # Float[b]
-    advantages, # Float[b]
-    old_values # Float[b]
-):
-    discounted_values = advantages + old_values
-    return F.mse_loss(pred_values, discounted_values)
+    return (actor_loss + entropy_aux_loss).mean()

 # agent contains the actor, critic, and the latent genetic pool

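Alongside the reworked `actor_loss` (the separate `critic_loss` helper is gone, since the critic now computes its own loss from `target = advantages + old_values`, and the actor loss is reduced to a scalar with `.mean()`), here is a self-contained sketch of the clipped surrogate objective in the same spirit. It mirrors textbook PPO rather than the package's exact function; all names are illustrative:

```python
import torch

def clipped_surrogate_loss(logits, old_log_probs, actions, advantages, eps_clip = 0.2, entropy_weight = 0.01):
    # log prob of the taken action under the current policy
    log_probs = logits.log_softmax(dim = -1).gather(-1, actions[:, None]).squeeze(-1)

    ratio = (log_probs - old_log_probs).exp()
    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)

    # pessimistic bound of the clipped and unclipped surrogates
    surrogate = -torch.min(clipped_ratio * advantages, ratio * advantages)

    # entropy bonus for exploration
    prob = logits.softmax(dim = -1)
    entropy = -(prob * prob.clamp(min = 1e-20).log()).sum(dim = -1)

    return (surrogate - entropy_weight * entropy).mean()

logits = torch.randn(8, 5, requires_grad = True)
loss = clipped_surrogate_loss(logits, torch.randn(8), torch.randint(0, 5, (8,)), torch.randn(8))
loss.backward()
```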
@@ -814,6 +851,11 @@ def create_agent(
     critic_dim_hiddens: int | tuple[int, ...],
 ) -> Agent:

+    latent_gene_pool = LatentGenePool(
+        num_latents = num_latents,
+        dim_latent = dim_latent
+    )
+
     actor = Actor(
         num_actions = actor_num_actions,
         dim_state = dim_state,
@@ -825,46 +867,133 @@
         dim_state = dim_state,
         dim_latent = dim_latent,
         dim_hiddens = critic_dim_hiddens
-    )
-
-    latent_gene_pool = LatentGenePool(
-        dim_state = dim_state,
-        num_latents = num_latents,
-        dim_latent = dim_latent,
-    )
-
+    )
     return Agent(actor = actor, critic = critic, latent_gene_pool = latent_gene_pool)

 # EPO - which is just PPO with natural selection of a population of latent variables conditioning the agent
 # the tricky part is that the latent ids for each episode / trajectory needs to be tracked

 Memory = namedtuple('Memory', [
+    'episode_id',
     'state',
     'latent_gene_id',
     'action',
     'log_prob',
     'reward',
-    'values',
+    'value',
     'done'
 ])

-MemoriesAndNextValue = namedtuple('MemoriesAndNextValue', [
+MemoriesAndFitnessScores = namedtuple('MemoriesAndFitnessScores', [
     'memories',
-    'next_value'
+    'fitness_scores'
 ])

 class EPO(Module):

     def __init__(
         self,
-        agent: Agent
+        agent: Agent,
+        episodes_per_latent,
+        max_episode_length,
+        action_sample_temperature = 1.
     ):
         super().__init__()
         self.agent = agent
+        self.action_sample_temperature = action_sample_temperature
+
+        self.num_latents = agent.latent_gene_pool.num_latents
+        self.episodes_per_latent = episodes_per_latent
+        self.max_episode_length = max_episode_length

+    @torch.no_grad()
     def forward(
         self,
         env
-    ) -> MemoriesAndNextValue:
+    ) -> MemoriesAndFitnessScores:
+
+        self.agent.eval()
+
+        invalid_episode = tensor(-1) # will use `episode_id` value of `-1` for the `next_value`, needed for not discarding last reward for generalized advantage estimate
+
+        memories: list[Memory] = []
+
+        fitness_scores = torch.zeros((self.num_latents))
+
+        for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):
+
+            for latent_id in tqdm(range(self.num_latents), desc = 'latent'):
+                time = 0
+
+                # initial state
+
+                state = env.reset()
+
+                # get latent from pool
+
+                latent = self.agent.latent_gene_pool(latent_id = latent_id)
+
+                # until maximum episode length
+
+                done = tensor(False)
+
+                while time < self.max_episode_length:
+
+                    batched_state = rearrange(state, '... -> 1 ...')
+
+                    # sample action

-        raise NotImplementedError
+                    action, log_prob = self.agent.get_actor_actions(batched_state, latent = latent, sample = True, temperature = self.action_sample_temperature)
+
+                    action = rearrange(action, '1 ... -> ...')
+                    log_prob = rearrange(log_prob, '1 ... -> ...')
+
+                    # values
+
+                    value = self.agent.get_critic_values(batched_state, latent = latent)
+
+                    value = rearrange(value, '1 ... -> ...')
+
+                    # get the next state, action, and reward
+
+                    state, reward, done = env(action)
+
+                    # update fitness for each gene as cumulative reward received, but make this customizable at some point
+
+                    fitness_scores[latent_id] += reward
+
+                    # store memories
+
+                    memory = Memory(
+                        tensor(episode_id),
+                        state,
+                        tensor(latent_id),
+                        action,
+                        log_prob,
+                        reward,
+                        value,
+                        done
+                    )
+
+                    memories.append(memory)
+
+                    time += 1
+
+                # need the final next value for GAE, iiuc
+
+                batched_state = rearrange(state, '... -> 1 ...')
+
+                next_value = self.agent.get_critic_values(batched_state, latent = latent)
+                next_value = rearrange(next_value, '1 ... -> ...')
+
+                memory_for_gae = memory._replace(
+                    episode_id = invalid_episode,
+                    value = next_value
+                )
+
+                memories.append(memory_for_gae)
+
+        return MemoriesAndFitnessScores(
+            memories = memories,
+            fitness_scores = fitness_scores
+        )
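The rollout above only touches the environment through `env.reset()` for the initial state and `env(action)` returning a `(state, reward, done)` tuple of tensors. A hedged sketch of a minimal custom environment satisfying that interface (this class is illustrative, not part of the package):

```python
import torch
from torch import tensor
from torch.nn import Module

class ConstantRewardEnv(Module):
    # hypothetical environment: random states, reward of 1 every step, never terminates
    def __init__(self, dim_state = 512):
        super().__init__()
        self.dim_state = dim_state

    def reset(self):
        return torch.randn(self.dim_state)

    def forward(self, action):
        state = torch.randn(self.dim_state)
        reward = tensor(1.)
        done = tensor(False)
        return state, reward, done
```

It can then be handed to `EPO` exactly like the bundled mock `Env` in the README example further down.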
evolutionary_policy_optimization/experimental.py
@@ -1,27 +1,47 @@
 import torch
+from einops import rearrange

 def crossover_weights(w1, w2, transpose = False):
     assert w2.shape == w2.shape
-    assert w1.ndim == 2
+
+    no_batch = w1.ndim == 2
+
+    if no_batch:
+        w1, w2 = tuple(rearrange(t, '... -> 1 ...') for t in (w1, w2))
+
+    assert w1.ndim == 3

     if transpose:
-        w1, w2 = w1.t(), w2.t()
+        w1, w2 = tuple(rearrange(t, 'b i j -> b j i') for t in (w1, w2))

-    rank = min(w2.shape)
+    rank = min(w2.shape[1:])
     assert rank >= 2

+    batch = w1.shape[0]
+
     u1, s1, v1 = torch.svd(w1)
     u2, s2, v2 = torch.svd(w2)

-    mask = torch.randperm(rank) < (rank // 2)
+    batch_randperm = torch.randn((batch, rank), device = w1.device).argsort(dim = -1)
+    mask = batch_randperm < (rank // 2)

-    u = torch.where(mask[None, :], u1, u2)
+    u = torch.where(mask[:, None, :], u1, u2)
     s = torch.where(mask, s1, s2)
-    v = torch.where(mask[None, :], v1, v2)
+    v = torch.where(mask[:, None, :], v1, v2)

     out = u @ torch.diag_embed(s) @ v.mT

     if transpose:
-        out = out.t()
+        out = rearrange(out, 'b j i -> b i j')
+
+    if no_batch:
+        out = rearrange(out, '1 ... -> ...')

     return out
+
+if __name__ == '__main__':
+    w1 = torch.randn(32, 16)
+    w2 = torch.randn(32, 16)
+    child = crossover_weights(w2, w2)
+
+    assert child.shape == w2.shape
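`crossover_weights` now accepts an optional leading batch dimension and recombines the parents' SVD factors with a per-sample random split of the singular directions. A hedged usage sketch, importing from the module path listed in the RECORD below (note that the `__main__` block above crosses `w2` with itself, and the first assert compares `w2.shape` against itself; both read as if they were meant to involve `w1`):

```python
import torch
from evolutionary_policy_optimization.experimental import crossover_weights

# two parent weight matrices, batched over a population of 4
w1 = torch.randn(4, 32, 16)
w2 = torch.randn(4, 32, 16)

child = crossover_weights(w1, w2)
assert child.shape == w1.shape

# unbatched matrices still work; the batch dim is added and removed internally
child_single = crossover_weights(torch.randn(32, 16), torch.randn(32, 16))
assert child_single.shape == (32, 16)
```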
evolutionary_policy_optimization/mock_env.py
@@ -4,15 +4,20 @@ import torch
 from torch import tensor, randn, randint
 from torch.nn import Module

+# functions
+
+def cast_tuple(v):
+    return v if isinstance(v, tuple) else (v,)
+
 # mock env

 class Env(Module):
     def __init__(
         self,
-        state_shape: tuple[int, ...]
+        state_shape: int | tuple[int, ...]
     ):
         super().__init__()
-        self.state_shape = state_shape
+        self.state_shape = cast_tuple(state_shape)
         self.register_buffer('dummy', tensor(0))

     @property
@@ -31,6 +36,6 @@ class Env(Module):
     ):
         state = randn(self.state_shape, device = self.device)
         reward = randint(0, 5, (), device = self.device).float()
-        done = zeros((), device = self.device, dtype = torch.bool)
+        done = torch.zeros((), device = self.device, dtype = torch.bool)

         return state, reward, done
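With `cast_tuple` in place, the mock `Env` accepts either a bare int or a tuple for `state_shape`, and each step returns a random state of that shape plus scalar reward and done tensors. A quick hedged smoke test, using the `Env` export added to `__init__.py` above:

```python
import torch
from evolutionary_policy_optimization import Env

env = Env(512)            # bare int is cast to (512,)
state = env.reset()
assert state.shape == (512,)

state, reward, done = env(torch.tensor(2))   # the mock ignores the action
assert reward.ndim == 0 and done.dtype == torch.bool
```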
evolutionary_policy_optimization-0.0.39.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.37
+Version: 0.0.39
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -60,8 +60,6 @@ This paper stands out, as I have witnessed the positive effects first hand in an
 
 Besides their latent variable strategy, I'll also throw in some attempts with crossover in weight space
 
-Update: I see, mixing genetic algorithms with gradient based method is already a research field, under [Memetic algorithms](https://en.wikipedia.org/wiki/Memetic_algorithm)
-
 ## Install
 
 ```bash
@@ -103,6 +101,45 @@ fitness = torch.randn(128)
 latent_pool.genetic_algorithm_step(fitness) # update latent genes with genetic algorithm
 ```
 
+End to end learning
+
+```python
+import torch
+
+from evolutionary_policy_optimization import (
+    create_agent,
+    EPO,
+    Env
+)
+
+agent = create_agent(
+    dim_state = 512,
+    num_latents = 8,
+    dim_latent = 32,
+    actor_num_actions = 5,
+    actor_dim_hiddens = (256, 128),
+    critic_dim_hiddens = (256, 128, 64)
+)
+
+epo = EPO(
+    agent,
+    episodes_per_latent = 1,
+    max_episode_length = 10,
+    action_sample_temperature = 1.
+)
+
+env = Env((512,))
+
+memories = epo(env)
+
+agent(memories)
+
+# saving and loading
+
+agent.save('./agent.pt', overwrite = True)
+agent.load('./agent.pt')
+```
+
 ## Citations
 
 ```bibtex
evolutionary_policy_optimization-0.0.39.dist-info/RECORD
@@ -0,0 +1,8 @@
+evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
+evolutionary_policy_optimization/epo.py,sha256=lzxPamJahE5KqBwzyYlGOwNeUoB2vONLwtRcWqCI_Jw,29800
+evolutionary_policy_optimization/experimental.py,sha256=9FrJGviLESlYysHI3i83efT9g2ZB9ha4u3K9HXN98_w,1100
+evolutionary_policy_optimization/mock_env.py,sha256=QqVPZVJtrvQmSDcnYDTob_A5sDwiUzGj6_tmo6BII5c,918
+evolutionary_policy_optimization-0.0.39.dist-info/METADATA,sha256=TTNQD7sTWIgpVwnrQrFFBD-cyySkvwJr_J3ABxTpor8,5409
+evolutionary_policy_optimization-0.0.39.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.39.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.39.dist-info/RECORD,,
evolutionary_policy_optimization-0.0.37.dist-info/RECORD
@@ -1,8 +0,0 @@
-evolutionary_policy_optimization/__init__.py,sha256=Qavcia0n13jjaWIS_LPW7QrxSLT_BBeKujCjF9kQjbA,133
-evolutionary_policy_optimization/epo.py,sha256=onIGNWHg1EGQwJ9TfkkJ8Yz8_S-BPoaqrxJwq54BXp0,25992
-evolutionary_policy_optimization/experimental.py,sha256=ktBKxRF27Qsj7WIgBpYlWXqMVxO9zOx2oD1JuDYRAwM,548
-evolutionary_policy_optimization/mock_env.py,sha256=3xrd-gwjZeVd_sEvxIyX0lppnMWcfQGOapO-XjKmExI,816
-evolutionary_policy_optimization-0.0.37.dist-info/METADATA,sha256=nPWBCvx02MHWdKu5cEoPmHFMFKhwepOfStkXIXR2NHc,4992
-evolutionary_policy_optimization-0.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.37.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.37.dist-info/RECORD,,