evolutionary-policy-optimization 0.0.72__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (15)
  1. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/PKG-INFO +1 -1
  2. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/epo.py +5 -5
  3. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/mock_env.py +9 -9
  4. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/pyproject.toml +1 -1
  5. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/requirements.txt +2 -0
  6. evolutionary_policy_optimization-0.1.0/train_gym.py +44 -0
  7. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.github/workflows/python-publish.yml +0 -0
  8. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.github/workflows/test.yml +0 -0
  9. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.gitignore +0 -0
  10. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/LICENSE +0 -0
  11. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/README.md +0 -0
  12. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/__init__.py +0 -0
  13. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/distributed.py +0 -0
  14. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/experimental.py +0 -0
  15. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/tests/test_epo.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evolutionary-policy-optimization
- Version: 0.0.72
+ Version: 0.1.0
  Summary: EPO - Pytorch
  Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
  Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
evolutionary_policy_optimization/epo.py
@@ -70,7 +70,7 @@ def interface_torch_numpy(fn, device):

          out = fn(*args, **kwargs)

-         out = tree_map(lambda t: from_numpy(t).to(device) if isinstance(t, np.ndarray) else t, out)
+         out = tree_map(lambda t: from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t, out)
          return out

      return decorated_fn
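The widened isinstance check matters because gymnasium can hand back bare NumPy scalars (most notably the reward as np.float64), which torch.from_numpy rejects. A standalone sketch of the conversion the new lambda performs, not taken from the package:

import numpy as np
import torch

reward = np.float64(1.5)                       # scalar reward, the kind gymnasium returns
# torch.from_numpy(reward) would raise TypeError: expected np.ndarray (got numpy.float64)
reward_t = torch.from_numpy(np.array(reward))  # np.array() promotes the scalar to a 0-dim array first
print(reward_t.dtype)                          # torch.float64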
@@ -1176,7 +1176,7 @@ class EPO(Module):
              if fix_environ_across_latents:
                  maybe_seed = environment_seeds[episode_id]

-             yield latent_id, episode_id, maybe_seed
+             yield latent_id, episode_id, maybe_seed.item()

      @torch.no_grad()
      def gather_experience_from(
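The added .item() converts the yielded seed from a tensor element to a plain Python int, which is what gymnasium's reset(seed = ...) expects. A hypothetical stand-in for environment_seeds (the package's actual seed generation is not shown in this diff):

import torch

environment_seeds = torch.randint(0, int(1e6), (4,))  # assumed: one random seed per episode
seed = environment_seeds[0]                           # indexing still gives a 0-dim tensor
print(type(seed.item()))                              # <class 'int'>, safe to pass as a gymnasium seed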
@@ -1210,7 +1210,7 @@ class EPO(Module):
              if fix_environ_across_latents:
                  reset_kwargs.update(seed = maybe_seed)

-             state = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
+             state, _ = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)

              # get latent from pool
@@ -1232,7 +1232,7 @@ class EPO(Module):

                  # get the next state, action, and reward

-                 state, reward, truncated, terminated = interface_torch_numpy(env.forward, device = self.device)(action)
+                 state, reward, truncated, terminated, _ = interface_torch_numpy(env.step, device = self.device)(action)

                  done = truncated or terminated
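These two call-site changes (reset and step, above) track the gymnasium API: reset returns an (observation, info) pair and step returns a five-tuple whose last element is an info dict, which the updated code discards. A minimal standalone illustration, using CartPole purely as an example environment:

import gymnasium as gym

env = gym.make('CartPole-v1')
state, info = env.reset(seed = 42)                               # (observation, info_dict)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
done = terminated or truncated                                   # episode boundary, as in the rollout loop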
@@ -1250,7 +1250,7 @@ class EPO(Module):
                      log_prob,
                      reward,
                      value,
-                     terminated
+                     tensor(terminated)
                  )

                  memory = Memory(*tuple(t.cpu() for t in memory))
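A short note on the wrap: with the gymnasium-style step, terminated presumably arrives as a plain Python bool (interface_torch_numpy only converts NumPy values), and the Memory fields are immediately moved with t.cpu(), which a bool does not support. A trivial sketch:

import torch

terminated = True                        # a plain bool has no .cpu()
terminated_t = torch.tensor(terminated)  # tensor(True)
print(terminated_t.cpu())                # now behaves like the other Memory fields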
evolutionary_policy_optimization/mock_env.py
@@ -22,34 +22,34 @@ class Env(Module):
          self.state_shape = cast_tuple(state_shape)

          self.can_terminate_after = can_terminate_after
-         self.register_buffer('step', tensor(0))
+         self.register_buffer('_step', tensor(0))

      @property
      def device(self):
-         return self.step.device
+         return self._step.device

      def reset(
          self,
          seed = None
      ):
          state = randn(self.state_shape, device = self.device)
-         self.step.zero_()
-         return state.numpy()
+         self._step.zero_()
+         return state.numpy(), None

-     def forward(
+     def step(
          self,
          actions,
      ):
          state = randn(self.state_shape, device = self.device)
          reward = randint(0, 5, (), device = self.device).float()

-         if self.step > self.can_terminate_after:
+         if self._step > self.can_terminate_after:
              truncated = tensor(choice((True, False)), device =self.device)
              terminated = tensor(choice((True, False)), device =self.device)
          else:
              truncated = terminated = tensor(False, device = self.device)

-         self.step.add_(1)
+         self._step.add_(1)

-         out = state, reward, truncated, terminated
-         return tuple(t.numpy() for t in out)
+         out = (state, reward, truncated, terminated)
+         return (*tuple(t.numpy() for t in out), None)
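With these renames the mock environment mirrors the gymnasium calling convention used by the rollout code above, and the '_step' buffer no longer collides with the new step method. A hedged usage sketch, assuming the constructor takes the state shape plus a can_terminate_after keyword and that Env is exported at package level (as train_gym.py's import suggests):

import torch
from evolutionary_policy_optimization import Env

env = Env((5,), can_terminate_after = 2)                                # assumed constructor signature
state, info = env.reset(seed = None)                                    # numpy state plus a None info, like gymnasium
state, reward, truncated, terminated, info = env.step(torch.tensor(0))  # dummy action for illustration
done = truncated or terminated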
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "evolutionary-policy-optimization"
- version = "0.0.72"
+ version = "0.1.0"
  description = "EPO - Pytorch"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
requirements.txt
@@ -1,3 +1,5 @@
  box2d-py
  gymnasium[box2d]>=1.0.0
+ moviepy>=1.0.3
+ numpy>=2.2.5
  tqdm
evolutionary_policy_optimization-0.1.0/train_gym.py (new file)
@@ -0,0 +1,44 @@
+ # gymnasium
+
+ import gymnasium as gym
+
+ env = gym.make(
+     'LunarLander-v3',
+     render_mode = 'rgb_array'
+ )
+
+ state_dim = env.observation_space.shape[0]
+ num_actions = env.action_space.n
+
+ # epo
+
+ import torch
+
+ from evolutionary_policy_optimization import (
+     create_agent,
+     EPO,
+     Env
+ )
+
+ agent = create_agent(
+     dim_state = state_dim,
+     num_latents = 1,
+     dim_latent = 32,
+     actor_num_actions = num_actions,
+     actor_dim_hiddens = (256, 128),
+     critic_dim_hiddens = (256, 128, 64),
+     latent_gene_pool_kwargs = dict(
+         frac_natural_selected = 0.5
+     )
+ )
+
+ epo = EPO(
+     agent,
+     episodes_per_latent = 1,
+     max_episode_length = 10,
+     action_sample_temperature = 1.
+ )
+
+ epo.to('cpu' if not torch.cuda.is_available() else 'cuda')
+
+ epo(agent, env, num_learning_cycles = 5)
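The rgb_array render mode together with the new moviepy requirement suggests recording rollout videos; one way to do that with gymnasium's own wrapper (a hedged sketch layered on top of the script above, not something the diff itself sets up):

import gymnasium as gym
from gymnasium.wrappers import RecordVideo

env = gym.make('LunarLander-v3', render_mode = 'rgb_array')
env = RecordVideo(env, video_folder = './videos')  # writes .mp4 files via moviepy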