evolutionary-policy-optimization 0.0.61__py3-none-any.whl → 0.0.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evolutionary_policy_optimization/epo.py +29 -13
- {evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/METADATA +1 -1
- {evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/RECORD +5 -5
- {evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/WHEEL +0 -0
- {evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/licenses/LICENSE +0 -0
evolutionary_policy_optimization/epo.py

```diff
@@ -104,8 +104,11 @@ def temp_batch_dim(fn):
 
 # fitness related
 
-def get_fitness_scores(
-
+def get_fitness_scores(
+    cum_rewards, # Float['gene episodes']
+    memories
+): # Float['gene']
+    return cum_rewards.sum(dim = -1) # sum all rewards across episodes, but could override this function for normalizing with whatever
 
 # generalized advantage estimate
 
```
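The reworked `get_fitness_scores` now receives the per-latent, per-episode cumulative rewards plus the raw memories and, by default, simply sums rewards over the episode dimension; the inline comment invites overriding it for custom normalization. A minimal sketch of such an override, assuming the same `(cum_rewards, memories) -> Float['gene']` contract (the `normalized_fitness_scores` name and the z-scoring are illustrative, not part of the package):

```python
import torch

# hypothetical override with the same contract as the new get_fitness_scores:
# (cum_rewards: Float['gene episodes'], memories) -> Float['gene'],
# here z-scoring the per-episode rewards before summing

def normalized_fitness_scores(cum_rewards, memories):
    normed = (cum_rewards - cum_rewards.mean()) / cum_rewards.std().clamp(min = 1e-6)
    return normed.sum(dim = -1)

# toy check on a fake (4 latents x 3 episodes) reward table
scores = normalized_fitness_scores(torch.randn(4, 3), memories = None)
assert scores.shape == (4,)
```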
```diff
@@ -684,7 +687,8 @@ class Agent(Module):
         ),
         actor_loss_kwargs: dict = dict(
             eps_clip = 0.2,
-            entropy_weight = .01
+            entropy_weight = .01,
+            norm_advantages = True
         ),
         ema_kwargs: dict = dict(),
         actor_optim_kwargs: dict = dict(),
```
```diff
@@ -826,9 +830,7 @@ class Agent(Module):
         memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories,
-
-        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
+        memories, rewards_per_latent_episode = memories_and_cumulative_rewards
 
         # stack memories
 
```
```diff
@@ -839,7 +841,13 @@ class Agent(Module):
         if is_distributed():
             memories = map(partial(all_gather_variable_dim, dim = 0), memories)
 
-
+            rewards_per_latent_episode = dist.all_reduce(rewards_per_latent_episode)
+
+        # calculate fitness scores
+
+        fitness_scores = self.get_fitness_scores(rewards_per_latent_episode, memories)
+
+        # process memories
 
         (
             episode_ids,
```
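In the distributed branch, each rank only fills the reward slots for the episodes it actually rolled out, so the per-latent, per-episode reward table is all-reduced before fitness scores are computed from it. A rough sketch of what that reduction amounts to, assuming the package's `dist.all_reduce` helper wraps a standard SUM all-reduce over `torch.distributed` (the `combine_rewards_across_ranks` wrapper below is illustrative):

```python
import torch
import torch.distributed as dist

# illustration only: assumes the package's `dist.all_reduce` amounts to a SUM
# all-reduce; torch.distributed.all_reduce reduces the tensor in place

def combine_rewards_across_ranks(rewards_per_latent_episode: torch.Tensor) -> torch.Tensor:
    # each rank fills only the (latent, episode) slots it rolled out and leaves
    # the rest at zero, so an element-wise sum across ranks recovers the full
    # reward table on every process
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(rewards_per_latent_episode, op = dist.ReduceOp.SUM)
    return rewards_per_latent_episode
```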
```diff
@@ -854,12 +862,16 @@ class Agent(Module):
 
         masks = 1. - dones.float()
 
+        # generalized advantage estimate
+
         advantages = self.calc_gae(
             rewards[:-1],
             values,
             masks[:-1],
         )
 
+        # dataset and dataloader
+
         valid_episode = episode_ids >= 0
 
         dataset = TensorDataset(
```
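The new comment only labels the existing generalized advantage estimation step. For reference, a generic GAE recursion looks like the sketch below; this is an illustration of the standard formula, not the package's `calc_gae`, and the `gamma` / `lam` defaults are assumptions:

```python
import torch

# generic GAE recursion for reference; not the package's calc_gae

def gae(rewards, values, masks, gamma = 0.99, lam = 0.95):
    # rewards, masks: Float[T]; values: Float[T + 1] (includes bootstrap value)
    advantages = torch.zeros_like(rewards)
    running = torch.tensor(0.)
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running = delta + gamma * lam * masks[t] * running
        advantages[t] = running
    return advantages

advantages = gae(torch.randn(5), torch.randn(6), torch.ones(5))
assert advantages.shape == (5,)
```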
```diff
@@ -871,6 +883,8 @@ class Agent(Module):
 
         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)
 
+        # updating actor and critic
+
         self.actor.train()
         self.critic.train()
 
```
```diff
@@ -954,7 +968,8 @@ def actor_loss(
     advantages, # Float[b]
     eps_clip = 0.2,
     entropy_weight = .01,
-    eps = 1e-5
+    eps = 1e-5,
+    norm_advantages = True
 ):
     batch = logits.shape[0]
 
```
```diff
@@ -966,7 +981,8 @@ def actor_loss(
 
     clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
 
-
+    if norm_advantages:
+        advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
     actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
```
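The new `norm_advantages` flag standardizes the advantages across the batch with `F.layer_norm`, which (with no affine parameters) is the usual subtract-mean, divide-by-std PPO normalization. A small self-contained check of that equivalence:

```python
import torch
import torch.nn.functional as F

# layer norm over the batch (no affine weight/bias) equals the classic
# (a - mean) / sqrt(var + eps) advantage normalization

advantages = torch.randn(8)
eps = 1e-5

normed = F.layer_norm(advantages, (advantages.shape[0],), eps = eps)
manual = (advantages - advantages.mean()) / torch.sqrt(advantages.var(unbiased = False) + eps)

assert torch.allclose(normed, manual, atol = 1e-5)
```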
```diff
@@ -1041,7 +1057,7 @@ Memory = namedtuple('Memory', [
 
 MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'cumulative_rewards'
+    'cumulative_rewards' # Float['latent episodes']
 ])
 
 class EPO(Module):
```
```diff
@@ -1098,7 +1114,7 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-
+        rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
         latent_ids_gen = self.latents_for_machine()
 
```
```diff
@@ -1147,7 +1163,7 @@ class EPO(Module):
 
                 # update cumulative rewards per latent, to be used as default fitness score
 
-
+                rewards_per_latent_episode[latent_id, episode_id] += reward
 
                 # store memories
 
```
```diff
@@ -1179,5 +1195,5 @@ class EPO(Module):
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
-            cumulative_rewards =
+            cumulative_rewards = rewards_per_latent_episode
         )
```
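Taken together, the rollout-side changes replace the old scalar accumulation with a `(num_latents, episodes_per_latent)` reward table that is filled in as episodes finish and handed back through the `cumulative_rewards` field; the learner then sums over the episode dimension as the default fitness. A toy end-to-end sketch of that bookkeeping, with made-up sizes and a stand-in reward:

```python
from collections import namedtuple
import torch

MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
    'memories',
    'cumulative_rewards'  # Float['latent episodes']
])

# made-up sizes: 4 latents, 2 episodes per latent
num_latents, episodes_per_latent = 4, 2
rewards_per_latent_episode = torch.zeros((num_latents, episodes_per_latent))

# during rollout, each step's reward is accumulated into its (latent, episode) slot
for latent_id in range(num_latents):
    for episode_id in range(episodes_per_latent):
        rewards_per_latent_episode[latent_id, episode_id] += 1.  # stand-in reward

rollout = MemoriesAndCumulativeRewards(memories = [], cumulative_rewards = rewards_per_latent_episode)

# the learner's default fitness is the per-latent sum over episodes
fitness_scores = rollout.cumulative_rewards.sum(dim = -1)
assert fitness_scores.shape == (num_latents,)
```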
{evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.61
+Version: 0.0.62
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
```
{evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/RECORD

```diff
@@ -1,9 +1,9 @@
 evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
 evolutionary_policy_optimization/distributed.py,sha256=7KgZdeS_wxBHo_du9XZFB1Cu318J-Bp66Xdr6Log_20,2423
-evolutionary_policy_optimization/epo.py,sha256=
+evolutionary_policy_optimization/epo.py,sha256=lWhHpsfq6vpri6yeDXSTLRMKGPwl0kt3klh0fVaInSs,35921
 evolutionary_policy_optimization/experimental.py,sha256=-IgqjJ_Wk_CMB1y9YYWpoYqTG9GZHAS6kbRdTluVevg,1563
 evolutionary_policy_optimization/mock_env.py,sha256=202KJ5g57wQvOzhGYzgHfBa7Y2do5uuDvl5kFg5o73g,934
-evolutionary_policy_optimization-0.0.
-evolutionary_policy_optimization-0.0.
-evolutionary_policy_optimization-0.0.
-evolutionary_policy_optimization-0.0.
+evolutionary_policy_optimization-0.0.62.dist-info/METADATA,sha256=oqJyUOXJwHrdf6JCVKPfOmhGJbXgqOmPWN_46l0JtWs,6220
+evolutionary_policy_optimization-0.0.62.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.62.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.62.dist-info/RECORD,,
```
{evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/WHEEL: file without changes
{evolutionary_policy_optimization-0.0.61.dist-info → evolutionary_policy_optimization-0.0.62.dist-info}/licenses/LICENSE: file without changes