evolutionary-policy-optimization 0.0.61__tar.gz → 0.0.62__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (14)
  1. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/PKG-INFO +1 -1
  2. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/epo.py +29 -13
  3. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/pyproject.toml +1 -1
  4. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/.github/workflows/python-publish.yml +0 -0
  5. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/.github/workflows/test.yml +0 -0
  6. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/.gitignore +0 -0
  7. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/LICENSE +0 -0
  8. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/README.md +0 -0
  9. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/__init__.py +0 -0
  10. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/distributed.py +0 -0
  11. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/experimental.py +0 -0
  12. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/mock_env.py +0 -0
  13. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/requirements.txt +0 -0
  14. {evolutionary_policy_optimization-0.0.61 → evolutionary_policy_optimization-0.0.62}/tests/test_epo.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.61
+Version: 0.0.62
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
evolutionary_policy_optimization/epo.py
@@ -104,8 +104,11 @@ def temp_batch_dim(fn):
 
 # fitness related
 
-def get_fitness_scores(cum_rewards, memories):
-    return cum_rewards
+def get_fitness_scores(
+    cum_rewards, # Float['gene episodes']
+    memories
+): # Float['gene']
+    return cum_rewards.sum(dim = -1) # sum all rewards across episodes, but could override this function for normalizing with whatever
 
 # generalized advantage estimate
 
@@ -684,7 +687,8 @@ class Agent(Module):
         ),
         actor_loss_kwargs: dict = dict(
             eps_clip = 0.2,
-            entropy_weight = .01
+            entropy_weight = .01,
+            norm_advantages = True
         ),
         ema_kwargs: dict = dict(),
         actor_optim_kwargs: dict = dict(),
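The new norm_advantages flag defaults to True, so prior behavior is preserved; it can be overridden through the same kwargs dict. A hypothetical override (only the dict is shown, since the rest of the Agent constructor is outside this diff):

    actor_loss_kwargs = dict(
        eps_clip = 0.2,
        entropy_weight = .01,
        norm_advantages = False  # skip per-batch advantage standardization
    )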
@@ -826,9 +830,7 @@ class Agent(Module):
         memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories, cumulative_rewards = memories_and_cumulative_rewards
-
-        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
+        memories, rewards_per_latent_episode = memories_and_cumulative_rewards
 
         # stack memories
 
@@ -839,7 +841,13 @@ class Agent(Module):
         if is_distributed():
             memories = map(partial(all_gather_variable_dim, dim = 0), memories)
 
-            fitness_scores = all_gather_variable_dim(fitness_scores, dim = 0)
+            rewards_per_latent_episode = dist.all_reduce(rewards_per_latent_episode)
+
+        # calculate fitness scores
+
+        fitness_scores = self.get_fitness_scores(rewards_per_latent_episode, memories)
+
+        # process memories
 
         (
             episode_ids,
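Fitness is now computed from the reduced per-latent, per-episode rewards after the distributed synchronization, so every rank scores the full population instead of gathering per-rank fitness values. For reference, torch.distributed.all_reduce is an in-place collective that sums by default (the synchronous call returns None); a minimal standalone sketch of the same reduction, assuming the process group is already initialized:

    import torch
    import torch.distributed as dist

    def reduce_rewards_across_ranks(rewards_per_latent_episode):
        # each rank holds rewards only for the episodes it rolled out;
        # summing across ranks fills in the rest of the (latent, episode) grid
        if dist.is_initialized():
            dist.all_reduce(rewards_per_latent_episode, op = dist.ReduceOp.SUM)
        return rewards_per_latent_episode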
@@ -854,12 +862,16 @@ class Agent(Module):
 
         masks = 1. - dones.float()
 
+        # generalized advantage estimate
+
         advantages = self.calc_gae(
             rewards[:-1],
             values,
             masks[:-1],
         )
 
+        # dataset and dataloader
+
         valid_episode = episode_ids >= 0
 
         dataset = TensorDataset(
@@ -871,6 +883,8 @@ class Agent(Module):
 
         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)
 
+        # updating actor and critic
+
         self.actor.train()
         self.critic.train()
 
@@ -954,7 +968,8 @@ def actor_loss(
     advantages, # Float[b]
     eps_clip = 0.2,
     entropy_weight = .01,
-    eps = 1e-5
+    eps = 1e-5,
+    norm_advantages = True
 ):
     batch = logits.shape[0]
 
@@ -966,7 +981,8 @@ def actor_loss(
 
     clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
 
-    advantages = F.layer_norm(advantages, (batch,), eps = eps)
+    if norm_advantages:
+        advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
     actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
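Advantage normalization in actor_loss is now optional via norm_advantages. For context, F.layer_norm over the flat advantage batch with no affine parameters is the usual per-batch standardization; a small self-contained check (illustrative only, not package code):

    import torch
    import torch.nn.functional as F

    advantages = torch.randn(8)

    # layer_norm with no weight/bias == (x - mean) / sqrt(biased_var + eps)
    normed_a = F.layer_norm(advantages, (advantages.shape[0],), eps = 1e-5)
    normed_b = (advantages - advantages.mean()) / advantages.var(unbiased = False).add(1e-5).sqrt()

    assert torch.allclose(normed_a, normed_b, atol = 1e-6)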
@@ -1041,7 +1057,7 @@ Memory = namedtuple('Memory', [
 
 MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'cumulative_rewards'
+    'cumulative_rewards' # Float['latent episodes']
 ])
 
 class EPO(Module):
@@ -1098,7 +1114,7 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-        cumulative_rewards = torch.zeros((self.num_latents))
+        rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
         latent_ids_gen = self.latents_for_machine()
 
@@ -1147,7 +1163,7 @@ class EPO(Module):
 
                 # update cumulative rewards per latent, to be used as default fitness score
 
-                cumulative_rewards[latent_id] += reward
+                rewards_per_latent_episode[latent_id, episode_id] += reward
 
                 # store memories
 
@@ -1179,5 +1195,5 @@ class EPO(Module):
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
-            cumulative_rewards = cumulative_rewards
+            cumulative_rewards = rewards_per_latent_episode
         )
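Taken together, the rollout now records rewards per latent and per episode instead of a single running total per latent, and the default fitness reduces over the episode dimension. A toy shape walkthrough (the numbers are made up, only the shapes mirror the diff):

    import torch

    num_latents, episodes_per_latent = 3, 2

    rewards_per_latent_episode = torch.zeros((num_latents, episodes_per_latent))  # Float['latent episodes']
    rewards_per_latent_episode[0, 1] += 1.  # reward accumulated for latent 0, episode 1

    fitness_scores = rewards_per_latent_episode.sum(dim = -1)  # Float['latent'], the new default fitness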
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "evolutionary-policy-optimization"
-version = "0.0.61"
+version = "0.0.62"
 description = "EPO - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }