evolutionary-policy-optimization 0.0.60__py3-none-any.whl → 0.0.62__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
evolutionary_policy_optimization/epo.py

@@ -104,8 +104,11 @@ def temp_batch_dim(fn):
 
 # fitness related
 
-def get_fitness_scores(cum_rewards, memories):
-    return cum_rewards
+def get_fitness_scores(
+    cum_rewards, # Float['gene episodes']
+    memories
+): # Float['gene']
+    return cum_rewards.sum(dim = -1) # sum all rewards across episodes, but could override this function for normalizing with whatever
 
 # generalized advantage estimate
 
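The new default fitness score sums each gene's rewards across its episodes. As the inline comment notes, this function can be overridden for other aggregations; a minimal sketch of one hypothetical override (a z-normalized variant, not part of the package) could look like this:

import torch

# hypothetical replacement for get_fitness_scores: sum rewards per gene across
# episodes, then z-normalize across the population so fitness is scale-free
def normalized_fitness_scores(cum_rewards, memories): # cum_rewards: Float['gene episodes']
    scores = cum_rewards.sum(dim = -1)                # Float['gene']
    return (scores - scores.mean()) / scores.std().clamp(min = 1e-5)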
@@ -684,7 +687,8 @@ class Agent(Module):
         ),
         actor_loss_kwargs: dict = dict(
             eps_clip = 0.2,
-            entropy_weight = .01
+            entropy_weight = .01,
+            norm_advantages = True
         ),
         ema_kwargs: dict = dict(),
         actor_optim_kwargs: dict = dict(),
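The new norm_advantages flag defaults to True, so the previous behaviour (advantages always normalized inside the actor loss) is preserved. Assuming these kwargs are forwarded to actor_loss as the hunk suggests, opting out would simply mean passing a modified dict to the Agent constructor, e.g.:

# hypothetical: turn off the new advantage normalization via the constructor kwargs
actor_loss_kwargs = dict(
    eps_clip = 0.2,
    entropy_weight = .01,
    norm_advantages = False
)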
@@ -826,9 +830,7 @@ class Agent(Module):
         memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories, cumulative_rewards = memories_and_cumulative_rewards
-
-        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
+        memories, rewards_per_latent_episode = memories_and_cumulative_rewards
 
         # stack memories
 
@@ -839,6 +841,14 @@ class Agent(Module):
         if is_distributed():
             memories = map(partial(all_gather_variable_dim, dim = 0), memories)
 
+            rewards_per_latent_episode = dist.all_reduce(rewards_per_latent_episode)
+
+        # calculate fitness scores
+
+        fitness_scores = self.get_fitness_scores(rewards_per_latent_episode, memories)
+
+        # process memories
+
         (
             episode_ids,
             states,
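Fitness is now computed only after the distributed gather, so every rank reduces the same full reward table. The dist.all_reduce in the hunk may resolve to a package-internal wrapper; a minimal sketch of the same aggregation idea written directly against torch.distributed:

import torch
import torch.distributed as dist

# sketch: sum the (num_latents, episodes_per_latent) reward table across ranks,
# so each rank can then derive identical fitness scores from it
def gather_rewards(rewards_per_latent_episode: torch.Tensor) -> torch.Tensor:
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(rewards_per_latent_episode, op = dist.ReduceOp.SUM) # in-place sum
    return rewards_per_latent_episode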
@@ -852,12 +862,16 @@ class Agent(Module):
 
         masks = 1. - dones.float()
 
+        # generalized advantage estimate
+
         advantages = self.calc_gae(
             rewards[:-1],
             values,
             masks[:-1],
         )
 
+        # dataset and dataloader
+
         valid_episode = episode_ids >= 0
 
         dataset = TensorDataset(
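The new comments label the two stages that follow: generalized advantage estimation over the rollout, then packing the tensors into a dataset. For reference, a textbook GAE consistent with the call shape above (rewards and masks of length T, values of length T + 1); the package's calc_gae may differ in details such as lambda handling:

import torch

# reference-only GAE: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t),
# accumulated backwards with decay gamma * lam
def gae(rewards, values, masks, gamma = 0.99, lam = 0.95):
    advantages = torch.zeros_like(rewards)
    running = 0.
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running = delta + gamma * lam * masks[t] * running
        advantages[t] = running
    return advantages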
@@ -869,6 +883,8 @@ class Agent(Module):
 
         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)
 
+        # updating actor and critic
+
         self.actor.train()
         self.critic.train()
 
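The added comment marks where the flattened rollout is consumed: shuffled minibatches drawn from the TensorDataset while actor and critic are in train mode. A generic sketch of that pattern with made-up tensors (not the package's exact fields):

import torch
from torch.utils.data import TensorDataset, DataLoader

states     = torch.randn(256, 8)          # stand-in rollout tensors
actions    = torch.randint(0, 4, (256,))
advantages = torch.randn(256)

dataset    = TensorDataset(states, actions, advantages)
dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)

for _ in range(2): # epochs
    for batch_states, batch_actions, batch_advantages in dataloader:
        pass # compute actor / critic losses on the minibatch and step the optimizers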
@@ -952,7 +968,8 @@ def actor_loss(
     advantages, # Float[b]
     eps_clip = 0.2,
     entropy_weight = .01,
-    eps = 1e-5
+    eps = 1e-5,
+    norm_advantages = True
 ):
     batch = logits.shape[0]
 
@@ -964,7 +981,8 @@ def actor_loss(
 
     clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
 
-    advantages = F.layer_norm(advantages, (batch,), eps = eps)
+    if norm_advantages:
+        advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
     actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
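Previously the advantages were always normalized; the norm_advantages flag now makes that step optional. With no affine parameters, F.layer_norm over the batch dimension is plain mean-variance normalization, as this small check illustrates:

import torch
import torch.nn.functional as F

adv = torch.randn(8)

# layer_norm without weight / bias == (x - mean) / sqrt(biased variance + eps)
a = F.layer_norm(adv, (8,), eps = 1e-5)
b = (adv - adv.mean()) / torch.sqrt(adv.var(unbiased = False) + 1e-5)
assert torch.allclose(a, b, atol = 1e-6)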
@@ -1039,7 +1057,7 @@ Memory = namedtuple('Memory', [
 
 MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'cumulative_rewards'
+    'cumulative_rewards' # Float['latent episodes']
 ])
 
 class EPO(Module):
@@ -1096,7 +1114,7 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-        cumulative_rewards = torch.zeros((self.num_latents))
+        rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
         latent_ids_gen = self.latents_for_machine()
 
@@ -1145,7 +1163,7 @@ class EPO(Module):
 
                     # update cumulative rewards per latent, to be used as default fitness score
 
-                    cumulative_rewards[latent_id] += reward
+                    rewards_per_latent_episode[latent_id, episode_id] += reward
 
                     # store memories
 
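Rewards are now bucketed per latent and per episode rather than summed into a single number per latent, which is what lets get_fitness_scores see the per-episode breakdown. A toy illustration of the new bookkeeping and its reduction to a fitness score per latent (made-up sizes):

import torch

num_latents, episodes_per_latent = 4, 3
rewards_per_latent_episode = torch.zeros((num_latents, episodes_per_latent))

# during rollout: each step's reward lands in its (latent, episode) slot
rewards_per_latent_episode[2, 1] += 1.5
rewards_per_latent_episode[2, 1] += 0.5

# default fitness: sum over the episode dimension -> one score per latent
fitness_scores = rewards_per_latent_episode.sum(dim = -1) # shape (num_latents,)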
@@ -1177,5 +1195,5 @@ class EPO(Module):
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
-            cumulative_rewards = cumulative_rewards
+            cumulative_rewards = rewards_per_latent_episode
         )
evolutionary_policy_optimization-0.0.62.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.60
+Version: 0.0.62
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
evolutionary_policy_optimization-0.0.62.dist-info/RECORD

@@ -1,9 +1,9 @@
 evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
 evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
-evolutionary_policy_optimization/epo.py,sha256=pD4j_oP7Cg8vSVQE34oJMcXcYN4oOi2_TtOAI5YbZDQ,35298
+evolutionary_policy_optimization/epo.py,sha256=lWhHpsfq6vpri6yeDXSTLRMKGPwl0kt3klh0fVaInSs,35921
 evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
 evolutionary_policy_optimization/mock_env.py,sha256=202KJ5g57wQvOzhGYzgHfBa7Y2do5uuDvl5kFg5o73g,934
-evolutionary_policy_optimization-0.0.60.dist-info/METADATA,sha256=vfiTPTi00-ZPqXgeVn4yEzCcD8p2k61aDmp5YX99Uww,6220
-evolutionary_policy_optimization-0.0.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.60.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.60.dist-info/RECORD,,
+evolutionary_policy_optimization-0.0.62.dist-info/METADATA,sha256=oqJyUOXJwHrdf6JCVKPfOmhGJbXgqOmPWN_46l0JtWs,6220
+evolutionary_policy_optimization-0.0.62.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.62.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.62.dist-info/RECORD,,