evolutionary-policy-optimization 0.0.60__tar.gz → 0.0.62__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/PKG-INFO +1 -1
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/epo.py +30 -12
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/pyproject.toml +1 -1
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/README.md +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/__init__.py +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/distributed.py +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/experimental.py +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/mock_env.py +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/requirements.txt +0 -0
- {evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/tests/test_epo.py +0 -0
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.60
+Version: 0.0.62
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/epo.py
RENAMED
@@ -104,8 +104,11 @@ def temp_batch_dim(fn):
 
 # fitness related
 
-def get_fitness_scores(
-
+def get_fitness_scores(
+    cum_rewards, # Float['gene episodes']
+    memories
+): # Float['gene']
+    return cum_rewards.sum(dim = -1) # sum all rewards across episodes, but could override this function for normalizing with whatever
 
 # generalized advantage estimate
 
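The new `get_fitness_scores` takes the full `(gene, episodes)` reward table plus the raw memories and, by default, simply sums rewards across episodes; the inline comment invites overriding it. A minimal sketch of a drop-in replacement with the same signature (the z-scoring is illustrative, not something the package ships):

```python
import torch

def get_fitness_scores_normalized(
    cum_rewards, # Float['gene episodes']
    memories     # unused here, kept only for signature compatibility
):               # Float['gene']
    # sum per-episode rewards as the default does, then z-score across latents (genes)
    scores = cum_rewards.sum(dim = -1)
    return (scores - scores.mean()) / scores.std().clamp(min = 1e-5)
```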
@@ -684,7 +687,8 @@ class Agent(Module):
         ),
         actor_loss_kwargs: dict = dict(
             eps_clip = 0.2,
-            entropy_weight = .01
+            entropy_weight = .01,
+            norm_advantages = True
         ),
         ema_kwargs: dict = dict(),
         actor_optim_kwargs: dict = dict(),
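The actor loss gains a `norm_advantages` switch, defaulting to `True`, exposed through `actor_loss_kwargs` on the `Agent` constructor. A hedged usage sketch — only the `actor_loss_kwargs` shape comes from this diff; every other constructor argument is elided and stays whatever an existing setup already passes:

```python
from evolutionary_policy_optimization.epo import Agent

# hypothetical construction; the remaining required Agent arguments are elided
agent = Agent(
    # ... actor / critic / latent-pool arguments as before ...
    actor_loss_kwargs = dict(
        eps_clip = 0.2,
        entropy_weight = .01,
        norm_advantages = False  # opt out if advantages are already well scaled
    )
)
```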
@@ -826,9 +830,7 @@ class Agent(Module):
         memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories,
-
-        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)
+        memories, rewards_per_latent_episode = memories_and_cumulative_rewards
 
         # stack memories
 
@@ -839,6 +841,14 @@ class Agent(Module):
         if is_distributed():
             memories = map(partial(all_gather_variable_dim, dim = 0), memories)
 
+            rewards_per_latent_episode = dist.all_reduce(rewards_per_latent_episode)
+
+        # calculate fitness scores
+
+        fitness_scores = self.get_fitness_scores(rewards_per_latent_episode, memories)
+
+        # process memories
+
         (
             episode_ids,
             states,
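With memories all-gathered across workers, the per-latent reward table is now also reduced before fitness is computed, so every rank scores latents against the global rewards. `dist.all_reduce` above is the package's own helper; a rough equivalent of the intent in plain `torch.distributed` (an illustration, not the package's implementation) is:

```python
import torch
import torch.distributed as torch_dist

def sum_rewards_across_workers(rewards_per_latent_episode: torch.Tensor) -> torch.Tensor:
    # each rank fills in rewards only for the episodes it rolled out;
    # an in-place sum across ranks gives every rank the complete table
    if torch_dist.is_available() and torch_dist.is_initialized():
        torch_dist.all_reduce(rewards_per_latent_episode, op = torch_dist.ReduceOp.SUM)
    return rewards_per_latent_episode
```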
@@ -852,12 +862,16 @@ class Agent(Module):
 
         masks = 1. - dones.float()
 
+        # generalized advantage estimate
+
         advantages = self.calc_gae(
             rewards[:-1],
             values,
             masks[:-1],
         )
 
+        # dataset and dataloader
+
         valid_episode = episode_ids >= 0
 
         dataset = TensorDataset(
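The new comments only label existing steps: `calc_gae` turns the stacked rewards, values, and masks into advantages (note `rewards[:-1]` and `masks[:-1]` against the one-longer `values`, which carries the bootstrap value). For reference, a self-contained sketch of a textbook generalized advantage estimate under those shape conventions — not necessarily the exact `calc_gae` in this package:

```python
import torch

def gae(rewards, values, masks, gamma = 0.99, lam = 0.95):
    # rewards, masks: [T]   values: [T + 1] (last entry is the bootstrap value)
    advantages = torch.zeros_like(rewards)
    running = torch.tensor(0.)
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running = delta + gamma * lam * masks[t] * running
        advantages[t] = running
    return advantages
```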
@@ -869,6 +883,8 @@ class Agent(Module):
 
         dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)
 
+        # updating actor and critic
+
         self.actor.train()
         self.critic.train()
 
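The comment marks the usual PPO-style update phase: the flattened rollout tensors are wrapped in a `TensorDataset`, shuffled into minibatches by a `DataLoader`, and both networks are switched to train mode. A generic standalone sketch of that pattern (shapes, batch size, and epoch count are placeholders):

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

# placeholder rollout tensors sharing the leading step dimension
states     = torch.randn(128, 4)
actions    = torch.randint(0, 2, (128,))
advantages = torch.randn(128)

dataset    = TensorDataset(states, actions, advantages)
dataloader = DataLoader(dataset, batch_size = 16, shuffle = True)

for _ in range(2):  # epochs
    for batch_states, batch_actions, batch_advantages in dataloader:
        pass  # compute actor / critic losses on the minibatch and step the optimizers
```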
@@ -952,7 +968,8 @@ def actor_loss(
     advantages, # Float[b]
     eps_clip = 0.2,
     entropy_weight = .01,
-    eps = 1e-5
+    eps = 1e-5,
+    norm_advantages = True
 ):
     batch = logits.shape[0]
 
@@ -964,7 +981,8 @@ def actor_loss(
 
     clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
 
-
+    if norm_advantages:
+        advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
     actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
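`F.layer_norm` over the batch dimension with no affine parameters is plain whitening: subtract the batch mean and divide by the (biased) batch standard deviation, stabilized by `eps`; the new flag simply makes that step optional. A small standalone check of the equivalence, using the same call as the diff:

```python
import torch
import torch.nn.functional as F

advantages = torch.randn(8)
batch = advantages.shape[0]
eps = 1e-5

normed = F.layer_norm(advantages, (batch,), eps = eps)
manual = (advantages - advantages.mean()) / torch.sqrt(advantages.var(unbiased = False) + eps)

assert torch.allclose(normed, manual, atol = 1e-6)
```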
@@ -1039,7 +1057,7 @@ Memory = namedtuple('Memory', [
 
 MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    'cumulative_rewards'
+    'cumulative_rewards' # Float['latent episodes']
 ])
 
 class EPO(Module):
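The annotation documents that `cumulative_rewards` is now a `(latents, episodes)` tensor rather than a single aggregate. A tiny sketch of building and unpacking the container with that shape (the namedtuple definition mirrors the diff; the sizes are placeholders):

```python
import torch
from collections import namedtuple

MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
    'memories',
    'cumulative_rewards'  # Float['latent episodes']
])

num_latents, episodes_per_latent = 16, 4

rollout = MemoriesAndCumulativeRewards(
    memories = [],
    cumulative_rewards = torch.zeros((num_latents, episodes_per_latent))
)

memories, rewards_per_latent_episode = rollout  # the same unpacking learn() now performs
```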
@@ -1096,7 +1114,7 @@ class EPO(Module):
 
         memories: list[Memory] = []
 
-
+        rewards_per_latent_episode = torch.zeros((self.num_latents, self.episodes_per_latent))
 
         latent_ids_gen = self.latents_for_machine()
 
@@ -1145,7 +1163,7 @@ class EPO(Module):
 
                 # update cumulative rewards per latent, to be used as default fitness score
 
-
+                rewards_per_latent_episode[latent_id, episode_id] += reward
 
                 # store memories
 
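Taken together, the rollout side now keeps a running `(num_latents, episodes_per_latent)` reward table: it is zero-initialized up front, each step's reward is accumulated into the cell for the current latent and episode, and the row sums later become the default fitness scores. A compressed standalone sketch of that bookkeeping (the loop structure and reward source are placeholders for the real environment interaction):

```python
import torch

num_latents, episodes_per_latent, steps = 16, 4, 10

rewards_per_latent_episode = torch.zeros((num_latents, episodes_per_latent))

for latent_id in range(num_latents):
    for episode_id in range(episodes_per_latent):
        for _ in range(steps):
            reward = torch.randn(()).item()  # placeholder reward from the environment
            rewards_per_latent_episode[latent_id, episode_id] += reward

fitness_scores = rewards_per_latent_episode.sum(dim = -1)  # what get_fitness_scores does by default
```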
@@ -1177,5 +1195,5 @@ class EPO(Module):
 
         return MemoriesAndCumulativeRewards(
             memories = memories,
-            cumulative_rewards =
+            cumulative_rewards = rewards_per_latent_episode
         )
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.github/workflows/python-publish.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.github/workflows/test.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/.gitignore
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/LICENSE
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/README.md
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/__init__.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/distributed.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/experimental.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/evolutionary_policy_optimization/mock_env.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/requirements.txt
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.60 → evolutionary_policy_optimization-0.0.62}/tests/test_epo.py
RENAMED
File without changes