evolutionary-policy-optimization 0.0.45__tar.gz → 0.0.47__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (13)
  1. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/PKG-INFO +1 -1
  2. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/epo.py +36 -12
  3. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/mock_env.py +2 -1
  4. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/pyproject.toml +1 -1
  5. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/python-publish.yml +0 -0
  6. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/test.yml +0 -0
  7. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.gitignore +0 -0
  8. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/LICENSE +0 -0
  9. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/README.md +0 -0
  10. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/__init__.py +0 -0
  11. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/experimental.py +0 -0
  12. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/requirements.txt +0 -0
  13. {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/tests/test_epo.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.45
+Version: 0.0.47
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization

evolutionary_policy_optimization/epo.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from functools import partial, wraps
 from pathlib import Path
 from collections import namedtuple
+from random import randrange
 
 import torch
 from torch import nn, cat, stack, is_tensor, tensor
@@ -87,6 +88,11 @@ def temp_batch_dim(fn):
 
     return inner
 
+# fitness related
+
+def get_fitness_scores(cum_rewards, memories):
+    return cum_rewards
+
 # generalized advantage estimate
 
 def calc_generalized_advantage_estimate(
@@ -643,6 +649,7 @@ class Agent(Module):
         actor_optim_kwargs: dict = dict(),
         critic_optim_kwargs: dict = dict(),
         latent_optim_kwargs: dict = dict(),
+        get_fitness_scores: Callable[..., Tensor] = get_fitness_scores
     ):
         super().__init__()
 
@@ -663,6 +670,10 @@ class Agent(Module):
         self.actor_loss = partial(actor_loss, **actor_loss_kwargs)
         self.calc_gae = partial(calc_generalized_advantage_estimate, **calc_gae_kwargs)
 
+        # fitness score related
+
+        self.get_fitness_scores = get_fitness_scores
+
         # learning hparams
 
         self.batch_size = batch_size
@@ -766,10 +777,12 @@ class Agent(Module):
 
     def forward(
         self,
-        memories_and_fitness_scores: MemoriesAndFitnessScores,
+        memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories, fitness_scores = memories_and_fitness_scores
+        memories, cumulative_rewards = memories_and_cumulative_rewards
+
+        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
 
        (
            episode_ids,
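
Note on the hunk above: Agent now exposes a get_fitness_scores hook that maps the rollout's cumulative rewards (and, optionally, the collected memories) into the fitness scores used downstream; the default defined earlier in epo.py returns the cumulative rewards unchanged. Below is a minimal sketch of a custom scorer with the same (cum_rewards, memories) interface; the z-normalization is an illustrative choice, not something the package ships.

import torch
from torch import Tensor

# hypothetical custom fitness function with the same (cum_rewards, memories)
# interface as the default; it z-normalizes cumulative rewards across latents
def normalized_fitness_scores(cum_rewards: Tensor, memories) -> Tensor:
    return (cum_rewards - cum_rewards.mean()) / cum_rewards.std().clamp(min = 1e-5)

scores = normalized_fitness_scores(torch.tensor([1., 2., 3., 4.]), [])

# such a function would be passed as Agent(..., get_fitness_scores = normalized_fitness_scores)
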
@@ -952,9 +965,9 @@ Memory = namedtuple('Memory', [
     'done'
 ])
 
-MemoriesAndFitnessScores = namedtuple('MemoriesAndFitnessScores', [
+MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'fitness_scores'
+    'cumulative_rewards'
 ])
 
 class EPO(Module):
@@ -977,8 +990,9 @@ class EPO(Module):
     @torch.no_grad()
     def forward(
         self,
-        env
-    ) -> MemoriesAndFitnessScores:
+        env,
+        fix_seed_across_latents = True
+    ) -> MemoriesAndCumulativeRewards:
 
         self.agent.eval()
 
@@ -986,16 +1000,26 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-        fitness_scores = torch.zeros((self.num_latents))
+        cumulative_rewards = torch.zeros((self.num_latents))
 
         for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):
 
+            # maybe fix seed for environment across all latents
+
+            env_reset_kwargs = dict()
+
+            if fix_seed_across_latents:
+                seed = randrange(int(1e6))
+                env_reset_kwargs = dict(seed = seed)
+
+            # for each latent (on a single machine for now)
+
             for latent_id in tqdm(range(self.num_latents), desc = 'latent'):
                 time = 0
 
                 # initial state
 
-                state = env.reset()
+                state = env.reset(**env_reset_kwargs)
 
                 # get latent from pool
 
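
Note on the seeding logic above: when fix_seed_across_latents is left at its default of True, a single seed is drawn per episode with randrange and forwarded to env.reset for every latent, so all latents are scored under the same episode conditions. A minimal sketch of the pattern, with a hypothetical seed-consuming reset standing in for the environment:

import torch
from random import randrange

# hypothetical stand-in for env.reset that turns the seed into a deterministic initial state
def reset_env(seed = None):
    generator = torch.Generator().manual_seed(seed) if seed is not None else None
    return torch.randn((4,), generator = generator)

num_latents = 3
fix_seed_across_latents = True

env_reset_kwargs = dict()

if fix_seed_across_latents:
    env_reset_kwargs = dict(seed = randrange(int(1e6)))

# every latent sees the same initial state when the seed is fixed
states = [reset_env(**env_reset_kwargs) for _ in range(num_latents)]
assert all(torch.equal(states[0], state) for state in states)
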
@@ -1019,9 +1043,9 @@ class EPO(Module):
 
                     state, reward, done = env(action)
 
-                    # update fitness for each gene as cumulative reward received, but make this customizable at some point
+                    # update cumulative rewards per latent, to be used as default fitness score
 
-                    fitness_scores[latent_id] += reward
+                    cumulative_rewards[latent_id] += reward
 
                     # store memories
 
@@ -1051,7 +1075,7 @@ class EPO(Module):
 
                 memories.append(memory_for_gae)
 
-        return MemoriesAndFitnessScores(
+        return MemoriesAndCumulativeRewards(
             memories = memories,
-            fitness_scores = fitness_scores
+            cumulative_rewards = cumulative_rewards
         )

evolutionary_policy_optimization/mock_env.py
@@ -25,7 +25,8 @@ class Env(Module):
         return self.dummy.device
 
     def reset(
-        self
+        self,
+        seed
     ):
         state = randn(self.state_shape, device = self.device)
         return state
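
The mock Env.reset above now accepts the seed that EPO passes along, though the stub still draws a random state without using it. For illustration only, a variant that consumes the seed for reproducible initial states might look like this (SeededEnv is not part of the package):

import torch
from torch import nn, randn

class SeededEnv(nn.Module):
    # illustrative mock environment whose reset honors the new seed argument
    def __init__(self, state_shape):
        super().__init__()
        self.state_shape = state_shape

    def reset(self, seed = None):
        generator = torch.Generator().manual_seed(seed) if seed is not None else None
        return randn(self.state_shape, generator = generator)

env = SeededEnv((5,))
assert torch.equal(env.reset(seed = 42), env.reset(seed = 42))
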
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "evolutionary-policy-optimization"
-version = "0.0.45"
+version = "0.0.47"
 description = "EPO - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }