evolutionary-policy-optimization 0.0.72__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16)
  1. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/PKG-INFO +2 -1
  2. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/__init__.py +4 -0
  3. evolutionary_policy_optimization-0.1.1/evolutionary_policy_optimization/env_wrappers.py +36 -0
  4. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/epo.py +6 -5
  5. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/mock_env.py +9 -9
  6. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/pyproject.toml +2 -1
  7. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/requirements.txt +2 -0
  8. evolutionary_policy_optimization-0.1.1/train_gym.py +43 -0
  9. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/python-publish.yml +0 -0
  10. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/test.yml +0 -0
  11. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.gitignore +0 -0
  12. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/LICENSE +0 -0
  13. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/README.md +0 -0
  14. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/distributed.py +0 -0
  15. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/experimental.py +0 -0
  16. {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/tests/test_epo.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evolutionary-policy-optimization
- Version: 0.0.72
+ Version: 0.1.1
  Summary: EPO - Pytorch
  Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
  Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -34,6 +34,7 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3.8
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9
+ Requires-Dist: accelerate
  Requires-Dist: adam-atan2-pytorch
  Requires-Dist: assoc-scan>=0.0.2
  Requires-Dist: einops>=0.8.1
evolutionary_policy_optimization/__init__.py
@@ -9,3 +9,7 @@ from evolutionary_policy_optimization.epo import (
  )

  from evolutionary_policy_optimization.mock_env import Env
+
+ from evolutionary_policy_optimization.env_wrappers import (
+     GymnasiumEnvWrapper
+ )
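With this re-export the wrapper becomes importable from the package root, which is exactly how the new train_gym.py script further down pulls it in:

    # GymnasiumEnvWrapper is now re-exported at the package root
    from evolutionary_policy_optimization import EPO, GymnasiumEnvWrapper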
evolutionary_policy_optimization/env_wrappers.py (new file)
@@ -0,0 +1,36 @@
+ import torch
+ from torch.nn import Module
+
+ from evolutionary_policy_optimization.epo import create_agent, Agent
+
+ class GymnasiumEnvWrapper(Module):
+     def __init__(
+         self,
+         env
+     ):
+         super().__init__()
+         self.env = env
+
+     def reset(self, *args, **kwargs):
+         return self.env.reset(*args, **kwargs)
+
+     def step(self, *args, **kwargs):
+         return self.env.step(*args, **kwargs)
+
+     def to_agent_hparams(self):
+         return dict(
+             dim_state = self.env.observation_space.shape[0],
+             actor_num_actions = self.env.action_space.n
+         )
+
+     def to_epo_agent(
+         self,
+         *args,
+         **kwargs
+     ) -> Agent:
+
+         return create_agent(
+             *args,
+             **self.to_agent_hparams(),
+             **kwargs
+         )
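The wrapper forwards reset and step untouched and derives agent hyperparameters from the env's spaces, so it assumes a flat 1D observation space and a discrete action space (it reads observation_space.shape[0] and action_space.n). A minimal usage sketch; CartPole-v1 is only an illustrative choice of env:

    import gymnasium as gym
    from evolutionary_policy_optimization import GymnasiumEnvWrapper

    # any env with a 1D observation space and a discrete action space fits the wrapper
    env = GymnasiumEnvWrapper(gym.make('CartPole-v1'))

    # hyperparameters inferred from the env's spaces
    print(env.to_agent_hparams())  # {'dim_state': 4, 'actor_num_actions': 2}

    # remaining keyword arguments are forwarded to create_agent,
    # mirroring the call made in train_gym.py below
    agent = env.to_epo_agent(
        num_latents = 8,
        dim_latent = 32,
        actor_dim_hiddens = (256, 128),
        critic_dim_hiddens = (256, 128, 64)
    )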
evolutionary_policy_optimization/epo.py
@@ -70,7 +70,7 @@ def interface_torch_numpy(fn, device):

          out = fn(*args, **kwargs)

-         out = tree_map(lambda t: from_numpy(t).to(device) if isinstance(t, np.ndarray) else t, out)
+         out = tree_map(lambda t: from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t, out)

          return out

      return decorated_fn
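The widened isinstance check matters because some Gymnasium envs return rewards as numpy float64 scalars rather than arrays, which is what this change accommodates; routing values through np.array first lets from_numpy accept them. A standalone sketch of the same conversion logic, assuming tree_map comes from torch.utils._pytree (the helper name below is illustrative):

    import numpy as np
    import torch
    from torch.utils._pytree import tree_map

    def numpy_outputs_to_torch(out, device = 'cpu'):
        # mirrors the updated lambda: convert numpy arrays *and* numpy float64
        # scalars (e.g. env rewards) into torch tensors, leave everything else alone
        return tree_map(
            lambda t: torch.from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t,
            out
        )

    # a Gymnasium-style step tuple: array state, float64 reward, python bools, info dict
    step_out = (np.random.randn(4), np.float64(1.), False, False, {})
    print(numpy_outputs_to_torch(step_out))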
evolutionary_policy_optimization/epo.py (continued)
@@ -1040,6 +1040,7 @@ def actor_loss(
  # agent contains the actor, critic, and the latent genetic pool

  def create_agent(
+     *,
      dim_state,
      num_latents,
      dim_latent,
@@ -1176,7 +1177,7 @@ class EPO(Module):
              if fix_environ_across_latents:
                  maybe_seed = environment_seeds[episode_id]

-             yield latent_id, episode_id, maybe_seed
+             yield latent_id, episode_id, maybe_seed.item()

      @torch.no_grad()
      def gather_experience_from(
@@ -1210,7 +1211,7 @@ class EPO(Module):
              if fix_environ_across_latents:
                  reset_kwargs.update(seed = maybe_seed)

-             state = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
+             state, _ = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)

              # get latent from pool

@@ -1232,7 +1233,7 @@ class EPO(Module):

              # get the next state, action, and reward

-             state, reward, truncated, terminated = interface_torch_numpy(env.forward, device = self.device)(action)
+             state, reward, truncated, terminated, _ = interface_torch_numpy(env.step, device = self.device)(action)

              done = truncated or terminated

@@ -1250,7 +1251,7 @@ class EPO(Module):
                  log_prob,
                  reward,
                  value,
-                 terminated
+                 tensor(terminated)
              )

              memory = Memory(*tuple(t.cpu() for t in memory))
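Taken together, these epo.py changes move the rollout loop onto the Gymnasium-style env contract: create_agent now takes keyword-only arguments (so **self.to_agent_hparams() composes safely with user kwargs), the per-episode seed is converted to a plain Python int with .item() before being handed to reset, reset is expected to return a (state, info) pair, step replaces forward and returns a five-tuple, and terminated is wrapped in tensor(...) so the stored Memory stays all-tensor. A small sketch of that contract; the names here are illustrative, not taken from the library:

    import torch

    # a tensor seed must become a plain int before Gymnasium's reset(seed = ...)
    environment_seeds = torch.randint(0, int(1e7), (4,))
    seed = environment_seeds[0].item()

    # shape of the env interface the gather loop now assumes:
    #   state, info = env.reset(seed = seed)
    #   state, reward, truncated, terminated, info = env.step(action)
    # the trailing info is discarded, and terminated is wrapped with
    # torch.tensor(...) before being appended to the episode memory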
evolutionary_policy_optimization/mock_env.py
@@ -22,34 +22,34 @@ class Env(Module):
          self.state_shape = cast_tuple(state_shape)

          self.can_terminate_after = can_terminate_after
-         self.register_buffer('step', tensor(0))
+         self.register_buffer('_step', tensor(0))

      @property
      def device(self):
-         return self.step.device
+         return self._step.device

      def reset(
          self,
          seed = None
      ):
          state = randn(self.state_shape, device = self.device)
-         self.step.zero_()
-         return state.numpy()
+         self._step.zero_()
+         return state.numpy(), None

-     def forward(
+     def step(
          self,
          actions,
      ):
          state = randn(self.state_shape, device = self.device)
          reward = randint(0, 5, (), device = self.device).float()

-         if self.step > self.can_terminate_after:
+         if self._step > self.can_terminate_after:
              truncated = tensor(choice((True, False)), device = self.device)
              terminated = tensor(choice((True, False)), device = self.device)
          else:
              truncated = terminated = tensor(False, device = self.device)

-         self.step.add_(1)
+         self._step.add_(1)

-         out = state, reward, truncated, terminated
-         return tuple(t.numpy() for t in out)
+         out = (state, reward, truncated, terminated)
+         return (*tuple(t.numpy() for t in out), None)
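The mock Env is updated in lockstep: the step counter buffer is renamed to _step so that step can become the public method name (replacing forward), reset now returns a (state, info) pair, and step returns a five-tuple, both with a None placeholder for info. A quick sanity-check sketch, assuming state_shape is the first constructor argument and that the mock ignores the actions it receives (as the body above suggests):

    import numpy as np
    from evolutionary_policy_optimization.mock_env import Env

    # assuming state_shape is the first constructor argument of the mock Env
    env = Env((5,))

    state, info = env.reset(seed = 42)   # Gymnasium-style (state, info), info is None here
    assert isinstance(state, np.ndarray) and info is None

    # five-tuple return; the mock never reads the action, so None is fine for this check
    state, reward, truncated, terminated, info = env.step(None)
    print(state.shape, float(reward), bool(truncated), bool(terminated), info)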
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "evolutionary-policy-optimization"
- version = "0.0.72"
+ version = "0.1.1"
  description = "EPO - Pytorch"
  authors = [
      { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -24,6 +24,7 @@ classifiers = [
  ]

  dependencies = [
+     "accelerate",
      "adam-atan2-pytorch",
      'assoc-scan>=0.0.2',
      'einx>=0.3.0',
requirements.txt
@@ -1,3 +1,5 @@
  box2d-py
  gymnasium[box2d]>=1.0.0
+ moviepy>=1.0.3
+ numpy>=2.2.5
  tqdm
train_gym.py (new file)
@@ -0,0 +1,43 @@
+ import torch
+
+ from evolutionary_policy_optimization import (
+     EPO,
+     GymnasiumEnvWrapper
+ )
+
+ # gymnasium
+
+ import gymnasium as gym
+
+ env = gym.make(
+     'LunarLander-v3',
+     render_mode = 'rgb_array'
+ )
+
+ env = GymnasiumEnvWrapper(env)
+
+ # epo
+
+ agent = env.to_epo_agent(
+     num_latents = 8,
+     dim_latent = 32,
+     actor_dim_hiddens = (256, 128),
+     critic_dim_hiddens = (256, 128, 64),
+     latent_gene_pool_kwargs = dict(
+         frac_natural_selected = 0.5,
+         frac_tournaments = 0.5
+     )
+ )
+
+ epo = EPO(
+     agent,
+     episodes_per_latent = 1,
+     max_episode_length = 10,
+     action_sample_temperature = 1.
+ )
+
+ epo.to('cpu' if not torch.cuda.is_available() else 'cuda')
+
+ epo(agent, env, num_learning_cycles = 1)
+
+ agent.save('./agent.pt', overwrite = True)
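train_gym.py is thus a complete end-to-end example: wrap the Gymnasium env, derive an Agent from it, run EPO for one learning cycle, and persist the result. For later reuse the checkpoint can presumably be restored onto a freshly constructed agent; this assumes Agent exposes a load counterpart to save, which is not shown in this diff:

    # hypothetical reload sketch; assumes Agent.load mirrors Agent.save (not shown in this diff)
    agent.load('./agent.pt')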