evolutionary-policy-optimization 0.1.19__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evolutionary_policy_optimization-0.2.0/.github/workflows/lint.yml +21 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/PKG-INFO +29 -1
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/README.md +27 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/distributed.py +0 -2
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/epo.py +108 -16
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/experimental.py +9 -1
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/pyproject.toml +16 -2
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/tests/test_epo.py +5 -5
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/train_gym.py +3 -3
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/__init__.py +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/env_wrappers.py +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/mock_env.py +0 -0
- {evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/requirements.txt +0 -0
evolutionary_policy_optimization-0.2.0/.github/workflows/lint.yml
ADDED
@@ -0,0 +1,21 @@
+name: Ruff
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install uv
+        python -m uv pip install ruff
+    - name: Lint with Ruff
+      run: |
+        ruff check evolutionary_policy_optimization/
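Contributors can run the same check locally before pushing. A minimal, hypothetical helper (not part of the package, assuming `ruff` is installed in the active environment) might look like:

```python
# Hypothetical helper: run the same Ruff check the CI workflow above performs.
import subprocess
import sys

def lint(package_dir: str = "evolutionary_policy_optimization/") -> int:
    # equivalent to the CI step `ruff check evolutionary_policy_optimization/`
    result = subprocess.run([sys.executable, "-m", "ruff", "check", package_dir])
    return result.returncode

if __name__ == "__main__":
    raise SystemExit(lint())
```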
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.1.19
+Version: 0.2.0
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -49,6 +49,7 @@ Requires-Dist: pufferlib>=2.0.6; extra == 'examples'
 Requires-Dist: tqdm; extra == 'examples'
 Provides-Extra: test
 Requires-Dist: pytest; extra == 'test'
+Requires-Dist: ruff>=0.4.2; extra == 'test'
 Description-Content-Type: text/markdown
 
 <img width="450px" alt="fig1" src="https://github.com/user-attachments/assets/33bef569-e786-4f09-bdee-56bad7ea9e6d" />
@@ -144,6 +145,22 @@ agent.save('./agent.pt', overwrite = True)
 agent.load('./agent.pt')
 ```
 
+## Contributing
+
+At the project root, run
+
+```bash
+$ pip install '.[test]' # or `uv pip install '.[test]'`
+```
+
+Then add your tests to `tests/test_epo.py` and run
+
+```bash
+$ pytest tests/
+```
+
+That's it
+
 ## Citations
 
 ```bibtex
@@ -237,4 +254,15 @@ agent.load('./agent.pt')
 }
 ```
 
+```bibtex
+@article{Lee2024SimBaSB,
+    title = {SimBa: Simplicity Bias for Scaling Up Parameters in Deep Reinforcement Learning},
+    author = {Hojoon Lee and Dongyoon Hwang and Donghu Kim and Hyunseung Kim and Jun Jet Tai and Kaushik Subramanian and Peter R. Wurman and Jaegul Choo and Peter Stone and Takuma Seno},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2410.09754},
+    url = {https://api.semanticscholar.org/CorpusID:273346233}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/README.md
RENAMED
@@ -91,6 +91,22 @@ agent.save('./agent.pt', overwrite = True)
 agent.load('./agent.pt')
 ```
 
+## Contributing
+
+At the project root, run
+
+```bash
+$ pip install '.[test]' # or `uv pip install '.[test]'`
+```
+
+Then add your tests to `tests/test_epo.py` and run
+
+```bash
+$ pytest tests/
+```
+
+That's it
+
 ## Citations
 
 ```bibtex
@@ -184,4 +200,15 @@ agent.load('./agent.pt')
 }
 ```
 
+```bibtex
+@article{Lee2024SimBaSB,
+    title = {SimBa: Simplicity Bias for Scaling Up Parameters in Deep Reinforcement Learning},
+    author = {Hojoon Lee and Dongyoon Hwang and Donghu Kim and Hyunseung Kim and Jun Jet Tai and Kaushik Subramanian and Peter R. Wurman and Jaegul Choo and Peter Stone and Takuma Seno},
+    journal = {ArXiv},
+    year = {2024},
+    volume = {abs/2410.09754},
+    url = {https://api.semanticscholar.org/CorpusID:273346233}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/distributed.py
RENAMED
@@ -61,8 +61,6 @@ def has_only_one_value(t):
     return (t == t[0]).all()
 
 def all_gather_variable_dim(t, dim = 0, sizes = None):
-    device, rank, world_size = t.device, dist.get_rank(), dist.get_world_size()
-
     if not exists(sizes):
         sizes = gather_sizes(t, dim = dim)
 
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/epo.py
RENAMED
@@ -19,7 +19,7 @@ from torch.utils.data import TensorDataset, DataLoader
 from torch.utils._pytree import tree_map
 
 import einx
-from einops import rearrange, repeat, einsum, pack
+from einops import rearrange, repeat, reduce, einsum, pack
 from einops.layers.torch import Rearrange
 
 from evolutionary_policy_optimization.distributed import (
@@ -192,7 +192,6 @@ def calc_generalized_advantage_estimate(
     use_accelerated = None
 ):
     use_accelerated = default(use_accelerated, rewards.is_cuda)
-    device = rewards.device
 
     values = F.pad(values, (0, 1), value = 0.)
     values, values_next = values[:-1], values[1:]
@@ -202,7 +201,7 @@ def calc_generalized_advantage_estimate(
 
     scan = AssocScan(reverse = True, use_accelerated = use_accelerated)
 
-    return scan(gates, delta)
+    return scan(gates, delta)
 
 # evolution related functions
 
(whitespace-only change: the removed `return scan(gates, delta)` line carried trailing spaces)
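For context, `calc_generalized_advantage_estimate` folds the TD residuals with a reverse associative scan. A plain-loop reference sketch of the quantity it computes follows; the exact `gates`/`delta` definitions sit outside the hunk above, so the standard GAE forms and input names (`rewards`, `values`, `masks` as equal-length 1-D tensors) are assumptions here, and this is not the package's `AssocScan` path.

```python
import torch

def gae_reference(rewards, values, masks, gamma = 0.99, lam = 0.95):
    # append one bootstrap value, mirroring F.pad(values, (0, 1)) above
    values = torch.cat((values, values.new_zeros(1)))
    values, values_next = values[:-1], values[1:]

    delta = rewards + gamma * values_next * masks - values  # TD residuals
    gates = gamma * lam * masks                             # per-step decay

    # reverse scan: advantage[t] = delta[t] + gates[t] * advantage[t + 1]
    advantages = torch.zeros_like(delta)
    running = torch.tensor(0.)
    for t in reversed(range(len(delta))):
        running = delta[t] + gates[t] * running
        advantages[t] = running

    return advantages
```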
@@ -336,6 +335,53 @@ class DynamicLIMe(Module):
 
         return einsum(hiddens, weights, 'l b d, b l -> b d')
 
+# state normalization
+
+class StateNorm(Module):
+    def __init__(
+        self,
+        dim,
+        eps = 1e-5
+    ):
+        # equation (3) in https://arxiv.org/abs/2410.09754 - 'RSMNorm'
+
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+
+        self.register_buffer('step', tensor(1))
+        self.register_buffer('running_mean', torch.zeros(dim))
+        self.register_buffer('running_variance', torch.ones(dim))
+
+    def forward(
+        self,
+        state
+    ):
+        assert state.shape[-1] == self.dim, f'expected feature dimension of {self.dim} but received {x.shape[-1]}'
+
+        time = self.step.item()
+        mean = self.running_mean
+        variance = self.running_variance
+
+        normed = (state - mean) / variance.sqrt().clamp(min = self.eps)
+
+        if not self.training:
+            return normed
+
+        # update running mean and variance
+
+        new_obs_mean = reduce(state, '... d -> d', 'mean')
+        delta = new_obs_mean - mean
+
+        new_mean = mean + delta / time
+        new_variance = (time - 1) / time * (variance + (delta ** 2) / time)
+
+        self.step.add_(1)
+        self.running_mean.copy_(new_mean)
+        self.running_variance.copy_(new_variance)
+
+        return normed
+
 # simple MLP networks, but with latent variables
 # the latent variables are the "genes" with the rest of the network as the scaffold for "gene expression" - as suggested in the paper
 
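The running statistics above follow a Welford-style update. A self-contained sketch of the same idea with plain tensors (illustrative names, not the package's API):

```python
import torch

# Illustrative re-implementation of the running-statistics update used by
# StateNorm above (RSMNorm, eq. 3 of https://arxiv.org/abs/2410.09754).
def update_running_stats(state, mean, variance, step, eps = 1e-5):
    # normalize with the statistics accumulated so far
    normed = (state - mean) / variance.sqrt().clamp(min = eps)

    # fold the current batch mean into the running mean / variance
    batch_mean = state.mean(dim = tuple(range(state.ndim - 1)))  # reduce '... d -> d'
    delta = batch_mean - mean

    new_mean = mean + delta / step
    new_variance = (step - 1) / step * (variance + delta.square() / step)
    return normed, new_mean, new_variance, step + 1

# usage sketch
dim = 4
mean, variance, step = torch.zeros(dim), torch.ones(dim), 1

for _ in range(10):
    states = torch.randn(8, dim) * 3. + 2.
    normed, mean, variance, step = update_running_stats(states, mean, variance, step)
```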
@@ -444,10 +490,13 @@ class Actor(Module):
         num_actions,
         dim,
         mlp_depth,
+        state_norm: StateNorm | None = None,
         dim_latent = 0,
     ):
         super().__init__()
 
+        self.state_norm = state_norm
+
         self.dim_latent = dim_latent
 
         self.init_layer = nn.Sequential(
@@ -467,6 +516,10 @@ class Actor(Module):
         state,
         latent
     ):
+        if exists(self.state_norm):
+            with torch.no_grad():
+                self.state_norm.eval()
+                state = self.state_norm(state)
 
         hidden = self.init_layer(state)
 
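The pattern used here, a shared normalizer applied inside `forward` but always in eval mode and without gradients so its statistics are only updated by an explicit pass elsewhere, looks like this in isolation (a generic sketch with made-up names, not the package's `Actor`):

```python
import torch
from torch import nn

class NormalizedHead(nn.Module):
    def __init__(self, dim, hidden = 64, norm: nn.Module | None = None):
        super().__init__()
        self.norm = norm  # e.g. a running-statistics normalizer shared with other heads
        self.net = nn.Sequential(nn.Linear(dim, hidden), nn.SiLU(), nn.Linear(hidden, 1))

    def forward(self, x):
        if self.norm is not None:
            # freeze the normalizer for inference: no stat updates, no gradients
            with torch.no_grad():
                self.norm.eval()
                x = self.norm(x)
        return self.net(x)

head = NormalizedHead(8, norm = nn.BatchNorm1d(8))
out = head(torch.randn(4, 8))
```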
@@ -482,6 +535,7 @@ class Critic(Module):
         mlp_depth,
         dim_latent = 0,
         use_regression = False,
+        state_norm: StateNorm | None = None,
         hl_gauss_loss_kwargs: dict = dict(
             min_value = -100.,
             max_value = 100.,
@@ -490,6 +544,8 @@ class Critic(Module):
     ):
         super().__init__()
 
+        self.state_norm = state_norm
+
         self.dim_latent = dim_latent
 
         self.init_layer = nn.Sequential(
@@ -523,6 +579,12 @@ class Critic(Module):
         eps_clip = 0.4,
         use_improved = True
     ):
+
+        if exists(self.state_norm):
+            with torch.no_grad():
+                self.state_norm.eval()
+                state = self.state_norm(state)
+
         logits = self.forward(state, latent, return_logits = True)
 
         value = self.maybe_bins_to_value(logits)
@@ -535,7 +597,8 @@ class Critic(Module):
         old_values_lo = old_values - eps_clip
         old_values_hi = old_values + eps_clip
 
-        is_between …
+        def is_between(lo, hi):
+            return (lo < value) & (value < hi)
 
         clipped_loss = loss_fn(logits, clipped_target)
         loss = loss_fn(logits, target)
@@ -921,6 +984,7 @@ class Agent(Module):
         critic: Critic,
         latent_gene_pool: LatentGenePool | None,
         optim_klass = AdoptAtan2,
+        state_norm: StateNorm | None = None,
         actor_lr = 8e-4,
         critic_lr = 8e-4,
         latent_lr = 1e-5,
@@ -965,12 +1029,20 @@ class Agent(Module):
         accelerate = Accelerator(**accelerate_kwargs)
         self.accelerate = accelerate
 
+        # state norm
+
+        self.state_norm = state_norm
+
         # actor, critic, and their shared latent gene pool
 
         self.actor = actor
 
         self.critic = critic
 
+        if exists(state_norm):
+            # insurance
+            actor.state_norm = critic.state_norm = state_norm
+
         self.use_critic_ema = use_critic_ema
         self.critic_ema = EMA(critic, beta = critic_ema_beta, include_online_model = False, **ema_kwargs) if use_critic_ema else None
 
@@ -1034,6 +1106,7 @@ class Agent(Module):
         self.clip_grad_norm_ = self.accelerate.clip_grad_norm_
 
         (
+            self.state_norm,
             self.actor,
             self.critic,
             self.latent_gene_pool,
@@ -1042,6 +1115,7 @@ class Agent(Module):
             self.latent_optim,
         ) = tuple(
             maybe(self.accelerate.prepare)(m) for m in (
+                self.state_norm,
                 self.actor,
                 self.critic,
                 self.latent_gene_pool,
@@ -1076,31 +1150,36 @@ class Agent(Module):
 
     def save(self, path, overwrite = False):
         path = Path(path)
+        unwrap = self.unwrap_model
 
         assert not path.exists() or overwrite
 
         pkg = dict(
-            actor = self.actor.state_dict(),
-            critic = self.critic.state_dict(),
+            state_norm = unwrap(self.state_norm).state_dict() if self.state_norm else None,
+            actor = unwrap(self.actor).state_dict(),
+            critic = unwrap(self.critic).state_dict(),
             critic_ema = self.critic_ema.state_dict() if self.use_critic_ema else None,
-            latents = self.latent_gene_pool.state_dict() if self.has_latent_genes else None,
-            actor_optim = self.actor_optim.state_dict(),
-            critic_optim = self.critic_optim.state_dict(),
-            latent_optim = self.latent_optim.state_dict() if exists(self.latent_optim) else None
+            latents = unwrap(self.latent_gene_pool).state_dict() if self.has_latent_genes else None,
+            actor_optim = unwrap(self.actor_optim).state_dict(),
+            critic_optim = unwrap(self.critic_optim).state_dict(),
+            latent_optim = unwrap(self.latent_optim).state_dict() if exists(self.latent_optim) else None
        )
 
         torch.save(pkg, str(path))
 
     def load(self, path):
+        unwrap = self.unwrap_model
         path = Path(path)
 
         assert path.exists()
 
         pkg = torch.load(str(path), weights_only = True)
 
-        self.actor.load_state_dict(pkg['actor'])
+        unwrap(self.actor).load_state_dict(pkg['actor'])
+
+        unwrap(self.critic).load_state_dict(pkg['critic'])
 
-        self. …
+        unwrap(self.latent_gene_pool).load_state_dict(pkg['latents'])
 
         if self.use_critic_ema:
             self.critic_ema.load_state_dict(pkg['critic_ema'])
@@ -1108,11 +1187,11 @@ class Agent(Module):
         if exists(pkg.get('latents', None)):
             self.latent_gene_pool.load_state_dict(pkg['latents'])
 
-        self.actor_optim.load_state_dict(pkg['actor_optim'])
-        self.critic_optim.load_state_dict(pkg['critic_optim'])
+        unwrap(self.actor_optim).load_state_dict(pkg['actor_optim'])
+        unwrap(self.critic_optim).load_state_dict(pkg['critic_optim'])
 
         if exists(pkg.get('latent_optim', None)):
-            self.latent_optim.load_state_dict(pkg['latent_optim'])
+            unwrap(self.latent_optim).load_state_dict(pkg['latent_optim'])
 
     @move_input_tensors_to_device
     def get_actor_actions(
@@ -1326,6 +1405,14 @@ class Agent(Module):
                 diversity_loss = diversity_loss.item()
             )
 
+        # update state norm if needed
+
+        if exists(self.state_norm):
+            self.state_norm.train()
+
+            for _, states, *_ in tqdm(dataloader, desc = 'state norm learning'):
+                self.state_norm(states)
+
         # apply evolution
 
         if self.has_latent_genes:
@@ -1406,12 +1493,15 @@ def create_agent(
         **latent_gene_pool_kwargs
     ) if has_latent_genes else None
 
+    state_norm = StateNorm(dim = dim_state)
+
     actor = Actor(
         num_actions = actor_num_actions,
         dim_state = dim_state,
         dim_latent = dim_latent,
         dim = actor_dim,
         mlp_depth = actor_mlp_depth,
+        state_norm = state_norm,
         **actor_kwargs
     )
 
@@ -1420,12 +1510,14 @@ def create_agent(
         dim_latent = dim_latent,
         dim = critic_dim,
         mlp_depth = critic_mlp_depth,
+        state_norm = state_norm,
         **critic_kwargs
     )
 
     agent = Agent(
         actor = actor,
         critic = critic,
+        state_norm = state_norm,
         latent_gene_pool = latent_gene_pool,
         use_critic_ema = use_critic_ema,
         **kwargs
@@ -1639,4 +1731,4 @@ class EPO(Module):
 
             agent.learn_from(memories)
 
-        print( …
+        print('training complete')
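The save/load changes above route every module and optimizer through `unwrap_model` before touching its `state_dict`. With Hugging Face Accelerate, `prepare` can wrap modules (for example in `DistributedDataParallel`), so checkpoints written from the wrapped object would carry wrapper-prefixed keys; unwrapping first keeps them loadable everywhere. A minimal sketch of the idea (illustrative, not the package's `Agent` API):

```python
import torch
from torch import nn
from accelerate import Accelerator

accelerator = Accelerator()

model = nn.Linear(4, 2)
model = accelerator.prepare(model)

# saving: unwrap first so the state dict keys are the plain module's keys,
# not those of any wrapper that prepare() may have added
unwrapped = accelerator.unwrap_model(model)
torch.save(unwrapped.state_dict(), 'model.pt')

# loading: also done against the unwrapped module
unwrapped.load_state_dict(torch.load('model.pt', weights_only = True))
```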
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/experimental.py
RENAMED
@@ -1,6 +1,10 @@
 import torch
+import torch.nn.functional as F
 from einops import rearrange
 
+def l2norm(t, dim = -1):
+    return F.normalize(t, dim = dim)
+
 def crossover_weights(w1, w2, transpose = False):
     assert w2.shape == w2.shape
 
@@ -27,7 +31,7 @@ def crossover_weights(w1, w2, transpose = False):
 
     u = torch.where(mask[:, None, :], u1, u2)
     s = torch.where(mask, s1, s2)
-    v = torch.where(mask[:, None …
+    v = torch.where(mask[:, :, None], v1, v2)
 
     out = u @ torch.diag_embed(s) @ v.mT
 
@@ -52,9 +56,13 @@ def mutate_weight(
     assert rank >= 2
 
     u, s, v = torch.svd(w)
+
     u = u + torch.randn_like(u) * mutation_strength
     v = v + torch.randn_like(v) * mutation_strength
 
+    u = l2norm(u, dim = -2)
+    v = l2norm(v, dim = -1)
+
     out = u @ torch.diag_embed(s) @ v.mT
 
     if transpose:
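For context, the crossover above recombines two weight matrices in their SVD bases, taking each singular component (a column of U, a singular value, a row of Vh) from one parent or the other; the fix corrects which axis of `v` the mask broadcasts over. A rough standalone sketch of that idea for a single, non-batched pair of matrices (an illustration using `torch.linalg.svd`, not the package's exact `crossover_weights`):

```python
import torch

def svd_crossover(w1, w2):
    # decompose both parents
    u1, s1, v1h = torch.linalg.svd(w1, full_matrices = False)
    u2, s2, v2h = torch.linalg.svd(w2, full_matrices = False)

    # choose each singular component from parent 1 or parent 2 at random
    mask = torch.rand_like(s1) < 0.5            # shape: (k,)

    u  = torch.where(mask[None, :], u1, u2)     # mix columns of U
    s  = torch.where(mask, s1, s2)              # mix singular values
    vh = torch.where(mask[:, None], v1h, v2h)   # mix rows of Vh

    return u @ torch.diag_embed(s) @ vh

child = svd_crossover(torch.randn(16, 8), torch.randn(16, 8))
```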
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "evolutionary-policy-optimization"
-version = "0.1.19"
+version = "0.2.0"
 description = "EPO - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -47,7 +47,8 @@ examples = [
 ]
 
 test = [
-    "pytest"
+    "pytest",
+    "ruff>=0.4.2",
 ]
 
 [tool.pytest.ini_options]
@@ -55,6 +56,19 @@ pythonpath = [
     "."
 ]
 
+[tool.ruff]
+line-length = 1000
+
+lint.ignore = [
+    "F722", # for jaxtyping shape annotation
+    "F401",
+    "F821"
+]
+
+lint.extend-select = [
+    "W291"
+]
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
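For orientation, the Ruff rules named in this config are F722 (invalid forward annotation, commonly triggered by jaxtyping-style string shapes), F401 (unused import), F821 (undefined name), all ignored here, and W291 (trailing whitespace), newly enforced. A small illustrative snippet, not taken from the package, of code each rule reacts to:

```python
import os                            # F401: imported but unused (ignored by this config)

def head(x: "float[batch dim]"):     # F722: jaxtyping-style shape annotation (ignored)
    return x

def bad():
    return undefined_name            # F821: undefined name (ignored)

value = 1                            # W291 fires if a line ends with trailing spaces
```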
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/tests/test_epo.py
RENAMED
@@ -19,7 +19,7 @@ def test_readme(
 
     latent_pool = LatentGenePool(
         num_latents = 128,
-        dim_latent = 32,
+        dim_latent = 32,
         num_islands = num_islands,
         fast_genetic_algorithm = sampled_mutation_strengths
     )
(whitespace-only change: the removed `dim_latent = 32,` line carried trailing spaces)
@@ -31,8 +31,8 @@ def test_readme(
 
     latent = latent_pool(latent_id = latent_ids, state = state)
 
-    actions = actor(state, latent)
-    value = critic(state, latent)
+    actions = actor(state, latent) # noqa: F841
+    value = critic(state, latent) # noqa: F841
 
     # interact with environment and receive rewards, termination etc
 
@@ -63,8 +63,8 @@ def test_create_agent(
 
     state = torch.randn(2, 512)
 
-    actions = agent.get_actor_actions(state, latent_id = latent_ids)
-    value = agent.get_critic_values(state, latent_id = latent_ids)
+    actions = agent.get_actor_actions(state, latent_id = latent_ids) # noqa: F841
+    value = agent.get_critic_values(state, latent_id = latent_ids) # noqa: F841
 
     # interact with environment and receive rewards, termination etc
 
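The `# noqa: F841` markers keep Ruff from flagging these intentionally unused assignments; the calls are still exercised so the tests fail on shape or runtime errors. The same pattern in isolation (illustrative only):

```python
import torch

def test_forward_runs():
    layer = torch.nn.Linear(8, 4)
    out = layer(torch.randn(2, 8))  # noqa: F841 - value unused, call kept to surface errors
```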
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/train_gym.py
RENAMED
@@ -21,7 +21,7 @@ env = gym.wrappers.RecordVideo(
     env = env,
     video_folder = './recordings',
     name_prefix = 'lunar-video',
-    episode_trigger = lambda eps_num: (eps_num % …
+    episode_trigger = lambda eps_num: (eps_num % 250) == 0,
     disable_logger = True
 )
 
@@ -53,8 +53,8 @@ agent = env.to_epo_agent(
 
 epo = EPO(
     agent,
-    episodes_per_latent = …
-    max_episode_length = …
+    episodes_per_latent = 10,
+    max_episode_length = 250,
     action_sample_temperature = 1.,
 )
 
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.github/workflows/python-publish.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.github/workflows/test.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/.gitignore
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/LICENSE
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/__init__.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/env_wrappers.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/evolutionary_policy_optimization/mock_env.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.1.19 → evolutionary_policy_optimization-0.2.0}/requirements.txt
RENAMED
File without changes