evolutionary-policy-optimization 0.0.45__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- evolutionary_policy_optimization/epo.py
+++ evolutionary_policy_optimization/epo.py
@@ -87,6 +87,11 @@ def temp_batch_dim(fn):
 
     return inner
 
+# fitness related
+
+def get_fitness_scores(cum_rewards, memories):
+    return cum_rewards
+
 # generalized advantage estimate
 
 def calc_generalized_advantage_estimate(
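The new default get_fitness_scores is a pass-through: a latent's fitness is simply its cumulative reward. Because the hook also receives the collected memories, callers can plug in their own scoring. Below is a minimal sketch of one such replacement; rank_based_fitness is an invented name, and rank-based fitness shaping is a common evolutionary-method heuristic rather than something this release ships.

import torch
from torch import Tensor

# hypothetical drop-in for the default get_fitness_scores above:
# map each latent's cumulative reward to its centered rank, a common
# fitness-shaping heuristic; the memories argument is ignored here

def rank_based_fitness(cum_rewards: Tensor, memories) -> Tensor:
    ranks = cum_rewards.argsort().argsort().float()        # ranks 0 .. num_latents - 1
    return ranks / max(cum_rewards.numel() - 1, 1) - 0.5   # rescaled and centered around 0

Judging from the hunks below, the Agent invokes the hook once per forward pass on the whole vector of per-latent rewards, so any callable with this signature that returns one score per latent should slot in.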
@@ -643,6 +648,7 @@ class Agent(Module):
         actor_optim_kwargs: dict = dict(),
         critic_optim_kwargs: dict = dict(),
         latent_optim_kwargs: dict = dict(),
+        get_fitness_scores: Callable[..., Tensor] = get_fitness_scores
     ):
         super().__init__()
 
@@ -663,6 +669,10 @@ class Agent(Module):
         self.actor_loss = partial(actor_loss, **actor_loss_kwargs)
         self.calc_gae = partial(calc_generalized_advantage_estimate, **calc_gae_kwargs)
 
+        # fitness score related
+
+        self.get_fitness_scores = get_fitness_scores
+
         # learning hparams
 
         self.batch_size = batch_size
@@ -766,10 +776,12 @@ class Agent(Module):
 
     def forward(
         self,
-        memories_and_fitness_scores: MemoriesAndFitnessScores,
+        memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
        epochs = 2
     ):
-        memories, fitness_scores = memories_and_fitness_scores
+        memories, cumulative_rewards = memories_and_cumulative_rewards
+
+        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
 
        (
            episode_ids,
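Spelled out, the two added lines in Agent.forward amount to the standalone steps below. The reward values are invented for illustration; with the shipped default hook, the fitness scores are just the cumulative rewards unchanged.

import torch

# default hook from this release: fitness = cumulative reward, untouched
def get_fitness_scores(cum_rewards, memories):
    return cum_rewards

cumulative_rewards = torch.tensor([1.5, -0.2, 3.0])   # one total per latent (made-up values)
memories = []                                         # a custom hook could also inspect these

fitness_scores = get_fitness_scores(cumulative_rewards, memories)
assert torch.equal(fitness_scores, cumulative_rewards)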
@@ -952,9 +964,9 @@ Memory = namedtuple('Memory', [
     'done'
 ])
 
-MemoriesAndFitnessScores = namedtuple('MemoriesAndFitnessScores', [
+MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'fitness_scores'
+    'cumulative_rewards'
 ])
 
 class EPO(Module):
@@ -978,7 +990,7 @@ class EPO(Module):
     def forward(
         self,
         env
-    ) -> MemoriesAndFitnessScores:
+    ) -> MemoriesAndCumulativeRewards:
 
         self.agent.eval()
 
@@ -986,7 +998,7 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-        fitness_scores = torch.zeros((self.num_latents))
+        cumulative_rewards = torch.zeros((self.num_latents))
 
         for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):
 
@@ -1019,9 +1031,9 @@ class EPO(Module):
 
                 state, reward, done = env(action)
 
-                # update fitness for each gene as cumulative reward received, but make this customizable at some point
+                # update cumulative rewards per latent, to be used as default fitness score
 
-                fitness_scores[latent_id] += reward
+                cumulative_rewards[latent_id] += reward
 
                 # store memories
 
@@ -1051,7 +1063,7 @@ class EPO(Module):
 
             memories.append(memory_for_gae)
 
-        return MemoriesAndFitnessScores(
+        return MemoriesAndCumulativeRewards(
             memories = memories,
-            fitness_scores = fitness_scores
+            cumulative_rewards = cumulative_rewards
         )
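Taken together, the epo.py hunks move fitness shaping out of the rollout: EPO.forward now only accumulates raw reward per latent and returns the totals, and the Agent converts them into fitness scores through the configurable hook. A self-contained toy mirror of that rollout bookkeeping, with invented latent ids and rewards:

import torch
from collections import namedtuple

# toy version of the accumulation and return shown above; no real
# memories are collected and the (latent_id, reward) pairs are made up

MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', ['memories', 'cumulative_rewards'])

num_latents = 4
cumulative_rewards = torch.zeros((num_latents,))

for latent_id, reward in [(0, 1.0), (0, 0.5), (2, 2.0), (3, -1.0)]:
    cumulative_rewards[latent_id] += reward          # same accumulation as in EPO.forward

rollout = MemoriesAndCumulativeRewards(memories = [], cumulative_rewards = cumulative_rewards)
print(rollout.cumulative_rewards)                    # tensor([1.5000, 0.0000, 2.0000, -1.0000])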
--- evolutionary_policy_optimization-0.0.45.dist-info/METADATA
+++ evolutionary_policy_optimization-0.0.46.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.45
+Version: 0.0.46
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
--- evolutionary_policy_optimization-0.0.45.dist-info/RECORD
+++ evolutionary_policy_optimization-0.0.46.dist-info/RECORD
@@ -1,8 +1,8 @@
 evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
-evolutionary_policy_optimization/epo.py,sha256=NA-d7pWDJyQYULDIVB25lnbpTbwMyxc1U8RU8XGTNts,31500
+evolutionary_policy_optimization/epo.py,sha256=SAhWgRY8uPQEKFg1_nz1mvh8A6S_sHwnDykhd0F5xEI,31853
 evolutionary_policy_optimization/experimental.py,sha256=9FrJGviLESlYysHI3i83efT9g2ZB9ha4u3K9HXN98_w,1100
 evolutionary_policy_optimization/mock_env.py,sha256=6AIc4mwL_C6JkAxwESJgCLxXHMzCAu2FcffVg3HkSm0,920
-evolutionary_policy_optimization-0.0.45.dist-info/METADATA,sha256=3jXsZBoltrWQJk2Yd6zu1KmCcl9AEuhxES_mX8E1lAk,6213
-evolutionary_policy_optimization-0.0.45.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.45.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.45.dist-info/RECORD,,
+evolutionary_policy_optimization-0.0.46.dist-info/METADATA,sha256=xP2kdKo52-X4Z5XXTPpW0M_NFI0spuigeL7fvqFlsRM,6213
+evolutionary_policy_optimization-0.0.46.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.46.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.46.dist-info/RECORD,,