evolutionary-policy-optimization 0.1.10__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/PKG-INFO +12 -1
  2. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/README.md +11 -0
  3. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/epo.py +81 -54
  4. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/pyproject.toml +1 -1
  5. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/tests/test_epo.py +5 -2
  6. evolutionary_policy_optimization-0.1.14/train_gym.py +63 -0
  7. evolutionary_policy_optimization-0.1.10/train_gym.py +0 -44
  8. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.github/workflows/python-publish.yml +0 -0
  9. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.github/workflows/test.yml +0 -0
  10. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.gitignore +0 -0
  11. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/LICENSE +0 -0
  12. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/__init__.py +0 -0
  13. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/distributed.py +0 -0
  14. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/env_wrappers.py +0 -0
  15. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/experimental.py +0 -0
  16. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/mock_env.py +0 -0
  17. {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/requirements.txt +0 -0
--- evolutionary_policy_optimization-0.1.10/PKG-INFO
+++ evolutionary_policy_optimization-0.1.14/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.1.10
+Version: 0.1.14
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -215,4 +215,15 @@ agent.load('./agent.pt')
 }
 ```
 
+```bibtex
+@article{Ash2019OnTD,
+    title   = {On the Difficulty of Warm-Starting Neural Network Training},
+    author  = {Jordan T. Ash and Ryan P. Adams},
+    journal = {ArXiv},
+    year    = {2019},
+    volume  = {abs/1910.08475},
+    url     = {https://api.semanticscholar.org/CorpusID:204788802}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
--- evolutionary_policy_optimization-0.1.10/README.md
+++ evolutionary_policy_optimization-0.1.14/README.md
@@ -162,4 +162,15 @@ agent.load('./agent.pt')
 }
 ```
 
+```bibtex
+@article{Ash2019OnTD,
+    title   = {On the Difficulty of Warm-Starting Neural Network Training},
+    author  = {Jordan T. Ash and Ryan P. Adams},
+    journal = {ArXiv},
+    year    = {2019},
+    volume  = {abs/1910.08475},
+    url     = {https://api.semanticscholar.org/CorpusID:204788802}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
--- evolutionary_policy_optimization-0.1.10/evolutionary_policy_optimization/epo.py
+++ evolutionary_policy_optimization-0.1.14/evolutionary_policy_optimization/epo.py
@@ -76,6 +76,14 @@ def maybe(fn):
 def interface_torch_numpy(fn, device):
     # for a given function, move all inputs from torch tensor to numpy, and all outputs from numpy to torch tensor
 
+    def to_torch_tensor(t):
+        if isinstance(t, (np.ndarray, np.float64)):
+            t = from_numpy(np.array(t))
+        elif isinstance(t, (float, int, bool)):
+            t = tensor(t)
+
+        return t.to(device)
+
     @wraps(fn)
     def decorated_fn(*args, **kwargs):
 
@@ -83,7 +91,7 @@ def interface_torch_numpy(fn, device):
 
         out = fn(*args, **kwargs)
 
-        out = tree_map(lambda t: from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t, out)
+        out = tree_map(to_torch_tensor, out)
         return out
 
     return decorated_fn
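The new `to_torch_tensor` helper generalizes the old lambda: besides numpy arrays, plain Python scalars returned by an environment (a `float` reward, a `bool` terminated flag) are now converted to tensors as well, which is what lets the rollout code further below store `terminated` directly. A minimal standalone sketch of the conversion, with an extra pass-through guard added here that is not in the library code:

```python
import numpy as np
import torch
from torch import tensor, from_numpy

def to_torch_tensor(t, device = 'cpu'):
    # numpy arrays / numpy scalars -> torch via from_numpy, python scalars -> torch.tensor
    if isinstance(t, (np.ndarray, np.float64)):
        t = from_numpy(np.array(t))
    elif isinstance(t, (float, int, bool)):
        t = tensor(t)

    # guard (not in the library code): pass through anything that is still not a tensor
    return t.to(device) if torch.is_tensor(t) else t

print(to_torch_tensor(np.ones(3)))   # tensor([1., 1., 1.], dtype=torch.float64)
print(to_torch_tensor(1.5))          # tensor(1.5000)
print(to_torch_tensor(True))         # tensor(True)
```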
@@ -146,6 +154,24 @@ def temp_batch_dim(fn):
 
     return inner
 
+# plasticity related
+
+def shrink_and_perturb_(
+    module,
+    shrink_factor = 0.5,
+    perturb_factor = 0.01
+):
+    # Shrink & Perturb
+    # Ash et al. https://arxiv.org/abs/1910.08475
+
+    assert 0. <= shrink_factor <= 1.
+
+    for p in module.parameters():
+        noise = torch.randn_like(p.data)
+        p.data.mul_(1. - shrink_factor).add_(noise * perturb_factor)
+
+    return module
+
 # fitness related
 
 def get_fitness_scores(
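Shrink & Perturb (Ash & Adams 2019, the citation added to the README above) is a remedy for the loss of plasticity that comes from repeatedly warm-starting a network: every weight is shrunk toward zero and a little Gaussian noise is added. A sketch of how the new helper might be applied between learning cycles; the import path mirrors the updated test, while the `reset_every` cadence is purely illustrative and not a package setting:

```python
from torch import nn
from evolutionary_policy_optimization.epo import shrink_and_perturb_

net = nn.Sequential(nn.Linear(8, 64), nn.SiLU(), nn.Linear(64, 4))

reset_every = 10  # hypothetical schedule

for cycle in range(100):
    ...  # run one learning cycle on `net` here

    if (cycle + 1) % reset_every == 0:
        # in-place with the default factors: w <- (1 - 0.5) * w + 0.01 * gaussian noise
        shrink_and_perturb_(net)
```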
@@ -267,37 +293,42 @@ class PowerLawDist(Module):
 class MLP(Module):
     def __init__(
         self,
-        dims: tuple[int, ...],
+        dim,
+        depth,
         dim_latent = 0,
+        expansion_factor = 2.
     ):
         super().__init__()
         dim_latent = default(dim_latent, 0)
 
-        assert len(dims) >= 2, 'must have at least two dimensions'
-
-        # add the latent to the first dim
-
-        first_dim, *rest_dims = dims
-        dims = (first_dim + dim_latent, *rest_dims)
-
         self.dim_latent = dim_latent
 
         self.needs_latent = dim_latent > 0
 
         self.encode_latent = nn.Sequential(
-            Linear(dim_latent, dim_latent),
+            Linear(dim_latent, dim),
             nn.SiLU()
         ) if self.needs_latent else None
 
-        # pairs of dimension
+        dim_hidden = int(dim * expansion_factor)
 
-        dim_pairs = tuple(zip(dims[:-1], dims[1:]))
+        # layers
 
-        # modules across layers
+        layers = []
+
+        for _ in range(depth):
+            layer = nn.Sequential(
+                nn.LayerNorm(dim, bias = False),
+                nn.Linear(dim, dim_hidden),
+                nn.SiLU(),
+                nn.Linear(dim_hidden, dim),
+            )
 
-        layers = ModuleList([Linear(dim_in, dim_out) for dim_in, dim_out in dim_pairs])
+            layers.append(layer)
 
-        self.layers = layers
+        # modules across layers
+
+        self.layers = ModuleList(layers)
 
     def forward(
         self,
@@ -319,17 +350,14 @@ class MLP(Module):
 
            assert latent.shape[0] == x.shape[0], f'received state with batch size {x.shape[0]} but latent ids received had batch size {latent_id.shape[0]}'
 
-            x = cat((x, latent), dim = -1)
+            x = x * latent
 
        # layers
 
        for ind, layer in enumerate(self.layers, start = 1):
            is_last = ind == len(self.layers)
 
-            x = layer(x)
-
-            if not is_last:
-                x = F.silu(x)
+            x = layer(x) + x
 
        return x
 
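Taken together, the two MLP hunks replace the old list of arbitrary layer widths with a fixed-width stack of pre-norm residual blocks, and switch latent conditioning from concatenation to a multiplicative gate (`x = x * latent`). A self-contained approximation reconstructed from the hunks above; the exact point where `encode_latent` is applied is inferred, since it is not shown in the diff:

```python
import torch
from torch import nn

class ResidualMLP(nn.Module):
    def __init__(self, dim, depth, dim_latent = 0, expansion_factor = 2.):
        super().__init__()
        self.needs_latent = dim_latent > 0

        # project the latent up to the model width so it can gate the input
        self.encode_latent = nn.Sequential(
            nn.Linear(dim_latent, dim),
            nn.SiLU()
        ) if self.needs_latent else None

        dim_hidden = int(dim * expansion_factor)

        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(dim, bias = False),
                nn.Linear(dim, dim_hidden),
                nn.SiLU(),
                nn.Linear(dim_hidden, dim),
            )
            for _ in range(depth)
        ])

    def forward(self, x, latent = None):
        if self.needs_latent:
            x = x * self.encode_latent(latent)   # multiplicative conditioning, replaces concat

        for layer in self.layers:
            x = layer(x) + x                     # pre-norm residual block

        return x

mlp = ResidualMLP(dim = 64, depth = 3, dim_latent = 16)
out = mlp(torch.randn(2, 64), latent = torch.randn(2, 16))
assert out.shape == (2, 64)
```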
@@ -341,26 +369,24 @@ class Actor(Module):
         self,
         dim_state,
         num_actions,
-        dim_hiddens: tuple[int, ...],
+        dim,
+        mlp_depth,
         dim_latent = 0,
     ):
         super().__init__()
 
-        assert len(dim_hiddens) >= 2
-        dim_first, *_, dim_last = dim_hiddens
-
         self.dim_latent = dim_latent
 
         self.init_layer = nn.Sequential(
-            nn.Linear(dim_state, dim_first),
+            nn.Linear(dim_state, dim),
             nn.SiLU()
         )
 
-        self.mlp = MLP(dims = dim_hiddens, dim_latent = dim_latent)
+        self.mlp = MLP(dim = dim, depth = mlp_depth, dim_latent = dim_latent)
 
         self.to_out = nn.Sequential(
-            nn.SiLU(),
-            nn.Linear(dim_last, num_actions),
+            nn.LayerNorm(dim, bias = False),
+            nn.Linear(dim, num_actions, bias = False),
         )
 
     def forward(
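The actor's output head also changes: instead of `SiLU -> Linear` off the last hidden width, it is now a pre-norm, bias-free projection from the shared width onto action logits. A small sketch of sampling a discrete action from such a head with a temperature; the sampling code here is illustrative, not the library's rollout code:

```python
import torch
from torch import nn

dim, num_actions = 128, 4

to_out = nn.Sequential(
    nn.LayerNorm(dim, bias = False),
    nn.Linear(dim, num_actions, bias = False),
)

hidden = torch.randn(1, dim)          # stand-in for the MLP output
logits = to_out(hidden)

temperature = 1.
probs = (logits / temperature).softmax(dim = -1)
action = torch.multinomial(probs, num_samples = 1)
```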
@@ -379,34 +405,31 @@ class Critic(Module):
     def __init__(
         self,
         dim_state,
-        dim_hiddens: tuple[int, ...],
+        dim,
+        mlp_depth,
         dim_latent = 0,
         use_regression = False,
         hl_gauss_loss_kwargs: dict = dict(
-            min_value = -10.,
-            max_value = 10.,
-            num_bins = 25,
-            sigma = 0.5
+            min_value = -100.,
+            max_value = 100.,
+            num_bins = 200
         )
     ):
         super().__init__()
 
-        assert len(dim_hiddens) >= 2
-        dim_first, *_, dim_last = dim_hiddens
-
         self.dim_latent = dim_latent
 
         self.init_layer = nn.Sequential(
-            nn.Linear(dim_state, dim_first),
+            nn.Linear(dim_state, dim),
            nn.SiLU()
         )
 
-        self.mlp = MLP(dims = dim_hiddens, dim_latent = dim_latent)
+        self.mlp = MLP(dim = dim, depth = mlp_depth, dim_latent = dim_latent)
 
-        self.final_act = nn.SiLU()
+        self.final_norm = nn.LayerNorm(dim, bias = False)
 
         self.to_pred = HLGaussLayer(
-            dim = dim_last,
+            dim = dim,
             use_regression = use_regression,
             hl_gauss_loss = hl_gauss_loss_kwargs
         )
@@ -470,7 +493,7 @@ class Critic(Module):
 
         hidden = self.mlp(hidden, latent)
 
-        hidden = self.final_act(hidden)
+        hidden = self.final_norm(hidden)
 
         pred_kwargs = dict(return_logits = return_logits) if not self.use_regression else dict()
         return self.to_pred(hidden, **pred_kwargs)
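The critic's HL-Gauss head widens the value support from [-10, 10] with 25 bins to [-100, 100] with 200 bins and drops the explicit `sigma`. For intuition, HL-Gauss regression turns a scalar target into a soft histogram by integrating a Gaussian centered on the target over the bin edges. A from-scratch sketch of that projection, not the `hl-gauss-pytorch` internals; tying sigma to the bin width below is an assumption:

```python
import torch

min_value, max_value, num_bins = -100., 100., 200

edges = torch.linspace(min_value, max_value, num_bins + 1)
bin_width = (max_value - min_value) / num_bins
sigma = 0.75 * bin_width   # assumed: sigma tied to bin width, since the explicit sigma argument was dropped

def hl_gauss_target(value):
    # probability mass each bin receives from a Gaussian centered at `value`
    cdf = torch.distributions.Normal(value, sigma).cdf(edges)
    probs = cdf[1:] - cdf[:-1]
    return probs / probs.sum()

target = hl_gauss_target(torch.tensor(3.7))
print(target.shape)     # torch.Size([200])
print(target.argmax())  # index of the bin containing 3.7
```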
@@ -825,16 +848,16 @@ class Agent(Module):
         critic: Critic,
         latent_gene_pool: LatentGenePool | None,
         optim_klass = AdoptAtan2,
-        actor_lr = 1e-4,
-        critic_lr = 1e-4,
+        actor_lr = 8e-4,
+        critic_lr = 8e-4,
         latent_lr = 1e-5,
-        actor_weight_decay = 1e-3,
-        critic_weight_decay = 1e-3,
+        actor_weight_decay = 5e-4,
+        critic_weight_decay = 5e-4,
         diversity_aux_loss_weight = 0.,
         use_critic_ema = True,
-        critic_ema_beta = 0.99,
-        max_grad_norm = 0.5,
-        batch_size = 16,
+        critic_ema_beta = 0.95,
+        max_grad_norm = 1.0,
+        batch_size = 32,
         calc_gae_kwargs: dict = dict(
             use_accelerated = False,
             gamma = 0.99,
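The training defaults move toward faster adaptation: higher learning rates, lower weight decay, a larger batch, a looser gradient-norm clip, and a faster-tracking critic EMA. On the last point, an exponential moving average with decay beta averages over roughly 1 / (1 - beta) recent updates, so moving beta from 0.99 to 0.95 shortens the critic EMA horizon from about 100 updates to about 20. A minimal sketch of that update rule, not necessarily the EMA implementation the library uses:

```python
import copy
from torch import nn

def ema_update_(ema_model, model, beta):
    # ema <- beta * ema + (1 - beta) * online
    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
        p_ema.data.lerp_(p.data, 1. - beta)

critic = nn.Linear(16, 1)
critic_ema = copy.deepcopy(critic)

for step in range(100):
    ...  # one critic update would happen here
    ema_update_(critic_ema, critic, beta = 0.95)  # ~20-update horizon (0.99 would be ~100)
```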
@@ -1251,8 +1274,10 @@ def create_agent(
     num_latents,
     dim_latent,
     actor_num_actions,
-    actor_dim_hiddens: int | tuple[int, ...],
-    critic_dim_hiddens: int | tuple[int, ...],
+    actor_dim,
+    actor_mlp_depth,
+    critic_dim,
+    critic_mlp_depth,
     use_critic_ema = True,
     latent_gene_pool_kwargs: dict = dict(),
     actor_kwargs: dict = dict(),
@@ -1275,14 +1300,16 @@ def create_agent(
         num_actions = actor_num_actions,
         dim_state = dim_state,
         dim_latent = dim_latent,
-        dim_hiddens = actor_dim_hiddens,
+        dim = actor_dim,
+        mlp_depth = actor_mlp_depth,
         **actor_kwargs
     )
 
     critic = Critic(
         dim_state = dim_state,
         dim_latent = dim_latent,
-        dim_hiddens = critic_dim_hiddens,
+        dim = critic_dim,
+        mlp_depth = critic_mlp_depth,
         **critic_kwargs
     )
 
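A hedged example of the new `create_agent` signature, which replaces the old `actor_dim_hiddens` / `critic_dim_hiddens` tuples with a width and a depth per network. Parameter names come from the hunk above; the top-level import and the concrete values are assumptions for illustration:

```python
from evolutionary_policy_optimization import create_agent

agent = create_agent(
    dim_state = 8,              # e.g. the LunarLander-v3 observation size
    num_latents = 16,
    dim_latent = 32,
    actor_num_actions = 4,
    actor_dim = 128,
    actor_mlp_depth = 2,
    critic_dim = 256,
    critic_mlp_depth = 4,
)
```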
@@ -1457,7 +1484,7 @@ class EPO(Module):
                    log_prob,
                    reward,
                    value,
-                    tensor(terminated)
+                    terminated
                )
 
                memory = Memory(*tuple(t.cpu() for t in memory))
@@ -1469,7 +1496,7 @@ class EPO(Module):
            if not terminated:
                # add bootstrap value if truncated
 
-                next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+                next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True, use_unwrapped_model = True)
 
                memory_for_gae = memory._replace(
                    episode_id = invalid_episode,
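Two rollout changes here: `terminated` is stored directly (it already arrives as a tensor thanks to the widened torch/numpy conversion above), and when an episode ends by truncation rather than termination, the return is bootstrapped with a critic value for the final state, now taken from the EMA critic on the unwrapped model. A from-scratch sketch of why that bootstrap value matters when discounting a truncated episode, with illustrative numbers only:

```python
import torch

gamma = 0.99
rewards = torch.tensor([1., 1., 1.])   # episode cut off by max_episode_length

def discounted_returns(rewards, bootstrap_value = 0.):
    returns, running = [], bootstrap_value
    for r in reversed(rewards.tolist()):
        running = r + gamma * running
        returns.append(running)
    return torch.tensor(returns[::-1])

print(discounted_returns(rewards))                        # treats truncation like termination
print(discounted_returns(rewards, bootstrap_value = 5.))  # credits the value of the cut-off future
```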
--- evolutionary_policy_optimization-0.1.10/pyproject.toml
+++ evolutionary_policy_optimization-0.1.14/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "evolutionary-policy-optimization"
-version = "0.1.10"
+version = "0.1.14"
 description = "EPO - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
--- evolutionary_policy_optimization-0.1.10/tests/test_epo.py
+++ evolutionary_policy_optimization-0.1.14/tests/test_epo.py
@@ -1,10 +1,11 @@
 import pytest
 
 import torch
-from evolutionary_policy_optimization import (
+from evolutionary_policy_optimization.epo import (
     LatentGenePool,
     Actor,
-    Critic
+    Critic,
+    shrink_and_perturb_
 )
 
 @pytest.mark.parametrize('latent_ids', (2, (2, 4)))
@@ -128,3 +129,5 @@ def test_e2e_with_mock_env(
 
     agent.save('./agent.pt', overwrite = True)
     agent.load('./agent.pt')
+
+    shrink_and_perturb_(agent)
--- /dev/null
+++ evolutionary_policy_optimization-0.1.14/train_gym.py
@@ -0,0 +1,63 @@
+import torch
+
+from evolutionary_policy_optimization import (
+    EPO,
+    GymnasiumEnvWrapper
+)
+
+# gymnasium
+
+from shutil import rmtree
+import gymnasium as gym
+
+env = gym.make(
+    'LunarLander-v3',
+    render_mode = 'rgb_array'
+)
+
+rmtree('./recordings', ignore_errors = True)
+
+env = gym.wrappers.RecordVideo(
+    env = env,
+    video_folder = './recordings',
+    name_prefix = 'lunar-video',
+    episode_trigger = lambda eps_num: (eps_num % 250) == 0,
+    disable_logger = True
+)
+
+env = GymnasiumEnvWrapper(env)
+
+# epo
+
+agent = env.to_epo_agent(
+    num_latents = 1,
+    dim_latent = 32,
+    actor_dim = 128,
+    actor_mlp_depth = 2,
+    critic_dim = 256,
+    critic_mlp_depth = 4,
+    latent_gene_pool_kwargs = dict(
+        frac_natural_selected = 0.5,
+        frac_tournaments = 0.5
+    ),
+    accelerate_kwargs = dict(
+        cpu = False
+    ),
+    actor_optim_kwargs = dict(
+        cautious_factor = 0.1,
+    ),
+    critic_optim_kwargs = dict(
+        cautious_factor = 0.1,
+    ),
+)
+
+epo = EPO(
+    agent,
+    episodes_per_latent = 50,
+    max_episode_length = 500,
+    action_sample_temperature = 1.,
+)
+
+epo(agent, env, num_learning_cycles = 100)
+
+agent.save('./agent.pt', overwrite = True)
--- evolutionary_policy_optimization-0.1.10/train_gym.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-
-from evolutionary_policy_optimization import (
-    EPO,
-    GymnasiumEnvWrapper
-)
-
-# gymnasium
-
-import gymnasium as gym
-
-env = gym.make(
-    'LunarLander-v3',
-    render_mode = 'rgb_array'
-)
-
-env = GymnasiumEnvWrapper(env)
-
-# epo
-
-agent = env.to_epo_agent(
-    num_latents = 8,
-    dim_latent = 32,
-    actor_dim_hiddens = (256, 128),
-    critic_dim_hiddens = (256, 128, 64),
-    latent_gene_pool_kwargs = dict(
-        frac_natural_selected = 0.5,
-        frac_tournaments = 0.5
-    ),
-    accelerate_kwargs = dict(
-        cpu = False
-    )
-)
-
-epo = EPO(
-    agent,
-    episodes_per_latent = 5,
-    max_episode_length = 10,
-    action_sample_temperature = 1.,
-)
-
-epo(agent, env, num_learning_cycles = 5)
-
-agent.save('./agent.pt', overwrite = True)