evolutionary-policy-optimization 0.0.63__tar.gz → 0.0.65__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/PKG-INFO +1 -1
  2. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/epo.py +27 -25
  3. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/mock_env.py +17 -5
  4. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/pyproject.toml +1 -1
  5. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.github/workflows/python-publish.yml +0 -0
  6. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.github/workflows/test.yml +0 -0
  7. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/.gitignore +0 -0
  8. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/LICENSE +0 -0
  9. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/README.md +0 -0
  10. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/__init__.py +0 -0
  11. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/distributed.py +0 -0
  12. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/experimental.py +0 -0
  13. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/requirements.txt +0 -0
  14. {evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/tests/test_epo.py +0 -0
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evolutionary-policy-optimization
- Version: 0.0.63
+ Version: 0.0.65
  Summary: EPO - Pytorch
  Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
  Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/epo.py
@@ -114,18 +114,17 @@ def get_fitness_scores(
  # generalized advantage estimate

  def calc_generalized_advantage_estimate(
-     rewards, # Float[n]
-     values, # Float[n+1]
-     masks, # Bool[n]
+     rewards,
+     values,
+     masks,
      gamma = 0.99,
      lam = 0.95,
      use_accelerated = None
  ):
-     assert values.shape[-1] == (rewards.shape[-1] + 1)
-
      use_accelerated = default(use_accelerated, rewards.is_cuda)
      device = rewards.device

+     values = F.pad(values, (0, 1), value = 0.)
      values, values_next = values[:-1], values[1:]

      delta = rewards + gamma * values_next * masks - values
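
Note on the hunk above: 0.0.63 required the caller to pass `values` with one extra bootstrap entry (the removed `Float[n+1]` comment and shape assert), while 0.0.65 accepts `rewards`, `values` and `masks` of the same length n and right-pads `values` with a zero bootstrap itself. Below is a minimal sketch of the resulting computation in plain PyTorch; `gae_sketch` is an illustrative name, not a library function, and the `use_accelerated` scan path of the real implementation is not reproduced here.

    import torch
    import torch.nn.functional as F

    def gae_sketch(rewards, values, masks, gamma = 0.99, lam = 0.95):
        # all three inputs share length n; append a zero bootstrap value on the right
        values = F.pad(values, (0, 1), value = 0.)
        values, values_next = values[:-1], values[1:]

        # td residuals, same formula as in the hunk above
        delta = rewards + gamma * values_next * masks - values

        # plain reverse scan over the episode
        adv, out = torch.tensor(0.), []
        for t in reversed(range(rewards.shape[-1])):
            adv = delta[t] + gamma * lam * masks[t] * adv
            out.append(adv)
        return torch.stack(out[::-1])

    rewards = torch.tensor([1., 0., 2.])
    values  = torch.tensor([0.5, 0.4, 0.3])   # no (n+1)-th entry needed anymore
    masks   = torch.tensor([1., 1., 0.])
    print(gae_sketch(rewards, values, masks))
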
@@ -866,21 +865,16 @@ class Agent(Module):
          # generalized advantage estimate

          advantages = self.calc_gae(
-             rewards[:-1],
+             rewards,
              values,
-             masks[:-1],
+             masks,
          )

          # dataset and dataloader

          valid_episode = episode_ids >= 0

-         dataset = TensorDataset(
-             *[
-                 advantages[valid_episode[:-1]],
-                 *[t[valid_episode] for t in (states, latent_gene_ids, actions, log_probs, values)]
-             ]
-         )
+         dataset = TensorDataset(*[t[valid_episode] for t in (advantages, states, latent_gene_ids, actions, log_probs, values)])

          dataloader = DataLoader(dataset, batch_size = self.batch_size, shuffle = True)

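Because `calc_gae` now consumes the full-length `rewards` and `masks` and pads internally, the returned `advantages` shares its first dimension with the other rollout tensors, so a single `valid_episode` mask can filter all of them before they are zipped into the `TensorDataset` (which requires every tensor to have the same size along dim 0). A toy illustration with made-up shapes, not the library's actual tensors:

    import torch
    from torch.utils.data import TensorDataset, DataLoader

    n = 10                                     # illustrative number of stored timesteps
    episode_ids = torch.randint(-1, 3, (n,))   # -1 marks bootstrap entries, as in the rollout code
    valid_episode = episode_ids >= 0

    advantages = torch.randn(n)                # same length as everything else in 0.0.65
    states     = torch.randn(n, 4)
    actions    = torch.randint(0, 2, (n,))

    # one boolean mask filters every tensor consistently
    dataset = TensorDataset(*[t[valid_episode] for t in (advantages, states, actions)])
    loader  = DataLoader(dataset, batch_size = 4, shuffle = True)
    print(len(dataset))
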
@@ -1068,7 +1062,8 @@ class EPO(Module):
          agent: Agent,
          episodes_per_latent,
          max_episode_length,
-         action_sample_temperature = 1.
+         action_sample_temperature = 1.,
+         fix_environ_across_latents = True
      ):
          super().__init__()
          self.agent = agent
@@ -1077,6 +1072,7 @@ class EPO(Module):
          self.num_latents = agent.latent_gene_pool.num_latents
          self.episodes_per_latent = episodes_per_latent
          self.max_episode_length = max_episode_length
+         self.fix_environ_across_latents = fix_environ_across_latents

          self.register_buffer('dummy', tensor(0))

@@ -1133,9 +1129,11 @@ class EPO(Module):
      def forward(
          self,
          env,
-         fix_environ_across_latents = True
+         fix_environ_across_latents = None
      ) -> MemoriesAndCumulativeRewards:

+         fix_environ_across_latents = default(fix_environ_across_latents, self.fix_environ_across_latents)
+
          self.agent.eval()

          invalid_episode = tensor(-1) # will use `episode_id` value of `-1` for the `next_value`, needed for not discarding last reward for generalized advantage estimate
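
The two hunks above promote `fix_environ_across_latents` to a constructor argument stored on the `EPO` instance; `forward` now defaults its keyword to `None` and falls back to the stored value through the `default` helper, so it can still be overridden per rollout call. A small standalone sketch of that constructor-default-with-override pattern; the class below is an illustrative stand-in, not the library's `EPO`:

    def default(value, fallback):
        # fall back when the argument is left as None, mirroring how the helper is used above
        return value if value is not None else fallback

    class RolloutConfigExample:
        def __init__(self, fix_environ_across_latents = True):
            self.fix_environ_across_latents = fix_environ_across_latents

        def forward(self, env, fix_environ_across_latents = None):
            # a keyword of None means "use whatever was chosen at construction time"
            return default(fix_environ_across_latents, self.fix_environ_across_latents)

    cfg = RolloutConfigExample(fix_environ_across_latents = True)
    assert cfg.forward(env = None) is True                                        # stored default
    assert cfg.forward(env = None, fix_environ_across_latents = False) is False   # per-call override
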
@@ -1179,12 +1177,14 @@

                      # get the next state, action, and reward

-                     state, reward, done = env(action)
+                     state, reward, truncated, terminated = env(action)
+
+                     done = truncated or terminated

                      # update cumulative rewards per latent, to be used as default fitness score

                      rewards_per_latent_episode[latent_id, episode_id] += reward
-
+
                      # store memories

                      memory = Memory(
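
The environment step now returns four values, `state, reward, truncated, terminated`, rather than a single `done` flag, mirroring the Gymnasium-style split between a time-limit cut-off and a genuine terminal state. Either flag ends the rollout loop, but as the next hunk shows, only `terminated` is written into the `Memory`. A tiny illustrative snippet of that bookkeeping (the values are made up):

    from torch import tensor

    truncated, terminated = tensor(True), tensor(False)   # e.g. max_episode_length was hit

    done = truncated or terminated   # either flag ends the episode loop
    stored_flag = terminated         # only true termination goes into the memory, so a
                                     # truncated episode can still receive a bootstrap value
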
@@ -1195,23 +1195,25 @@
                          log_prob,
                          reward,
                          value,
-                         done
+                         terminated
                      )

                      memories.append(memory)

                      time += 1

-                 # need the final next value for GAE, iiuc
+                 if not terminated:
+                     # add bootstrap value if truncated

-                 next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+                     next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)

-                 memory_for_gae = memory._replace(
-                     episode_id = invalid_episode,
-                     value = next_value
-                 )
+                     memory_for_gae = memory._replace(
+                         episode_id = invalid_episode,
+                         value = next_value,
+                         done = tensor(True)
+                     )

-                 memories.append(memory_for_gae)
+                     memories.append(memory_for_gae)

          return MemoriesAndCumulativeRewards(
              memories = memories,
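
The bootstrap memory (flagged with `episode_id = -1` and carrying the critic's value of the final state) is now appended only `if not terminated`. That matches the standard GAE reasoning: after a true terminal state the value of the next state is zero by definition, which the new zero right-padding inside `calc_generalized_advantage_estimate` already supplies, while after a truncation (for example hitting `max_episode_length`) the critic's estimate of the cut-off state should be bootstrapped so the last rewards are not treated as if the episode genuinely ended there. A textbook-GAE sketch of that distinction, independent of the library's exact memory layout; names and numbers are illustrative:

    import torch

    def gae(rewards, values, next_value, masks, gamma = 0.99, lam = 0.95):
        # single-episode GAE; `next_value` is the value of the state reached after the last step
        values_next = torch.cat((values[1:], next_value[None]))
        delta = rewards + gamma * values_next * masks - values
        adv, out = torch.tensor(0.), []
        for t in reversed(range(rewards.shape[-1])):
            adv = delta[t] + gamma * lam * masks[t] * adv
            out.append(adv)
        return torch.stack(out[::-1])

    rewards = torch.tensor([1., 1., 1.])
    values  = torch.tensor([0.9, 0.9, 0.9])

    # terminated: the post-terminal state is worth exactly zero, no bootstrap needed
    print(gae(rewards, values, next_value = torch.tensor(0.), masks = torch.tensor([1., 1., 0.])))

    # truncated: the episode was merely cut off, so bootstrap with the critic's estimate
    print(gae(rewards, values, next_value = torch.tensor(0.8), masks = torch.tensor([1., 1., 1.])))
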
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/evolutionary_policy_optimization/mock_env.py
@@ -1,4 +1,5 @@
  from __future__ import annotations
+ from random import choice

  import torch
  from torch import tensor, randn, randint
@@ -14,21 +15,25 @@ def cast_tuple(v):
  class Env(Module):
      def __init__(
          self,
-         state_shape: int | tuple[int, ...]
+         state_shape: int | tuple[int, ...],
+         can_terminate_after = 2
      ):
          super().__init__()
          self.state_shape = cast_tuple(state_shape)
-         self.register_buffer('dummy', tensor(0))
+
+         self.can_terminate_after = can_terminate_after
+         self.register_buffer('step', tensor(0))

      @property
      def device(self):
-         return self.dummy.device
+         return self.step.device

      def reset(
          self,
          seed = None
      ):
          state = randn(self.state_shape, device = self.device)
+         self.step.zero_()
          return state

      def forward(
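
Replacing the `dummy` buffer with a `step` counter registered through `register_buffer` keeps the counter as part of the module's state, so it follows the module across devices; that is why the `device` property can simply read `self.step.device` and `reset` can clear it in place. A tiny sketch of that buffer behaviour, using a stand-in class rather than the package's `Env`:

    import torch
    from torch import tensor, nn

    class CounterModule(nn.Module):
        def __init__(self):
            super().__init__()
            # buffers move with .to()/.cuda() along with the rest of the module
            self.register_buffer('step', tensor(0))

        @property
        def device(self):
            return self.step.device

    m = CounterModule()
    m.step.add_(1)    # in-place update, like the mock env's per-step increment
    m.step.zero_()    # in-place reset, as done in Env.reset
    print(m.device)   # cpu here; after m.to('cuda') the buffer (and device) would follow
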
@@ -37,6 +42,13 @@ class Env(Module):
      ):
          state = randn(self.state_shape, device = self.device)
          reward = randint(0, 5, (), device = self.device).float()
-         done = torch.zeros((), device = self.device, dtype = torch.bool)

-         return state, reward, done
+         if self.step > self.can_terminate_after:
+             truncated = tensor(choice((True, False)), device =self.device)
+             terminated = tensor(choice((True, False)), device =self.device)
+         else:
+             truncated = terminated = tensor(False, device = self.device)
+
+         self.step.add_(1)
+
+         return state, reward, truncated, terminated
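
Taken together, the updated mock `Env` keeps an internal `step` buffer, clears it in `reset`, allows `truncated`/`terminated` to flip on only after `can_terminate_after` steps, and returns the four-tuple the new rollout loop expects. A usage sketch, assuming the 0.0.65 package is installed; the state shape and the action tensor are arbitrary since the mock env samples its outputs randomly:

    import torch
    from evolutionary_policy_optimization.mock_env import Env

    env = Env((5,), can_terminate_after = 2)   # the episode can only end after step 2

    state = env.reset()
    done = torch.tensor(False)

    while not done:
        action = torch.randint(0, 2, ())
        state, reward, truncated, terminated = env(action)
        done = truncated | terminated
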
{evolutionary_policy_optimization-0.0.63 → evolutionary_policy_optimization-0.0.65}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "evolutionary-policy-optimization"
- version = "0.0.63"
+ version = "0.0.65"
  description = "EPO - Pytorch"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }