evolutionary-policy-optimization 0.1.10__tar.gz → 0.1.14__tar.gz
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/PKG-INFO +12 -1
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/README.md +11 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/epo.py +81 -54
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/pyproject.toml +1 -1
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/tests/test_epo.py +5 -2
- evolutionary_policy_optimization-0.1.14/train_gym.py +63 -0
- evolutionary_policy_optimization-0.1.10/train_gym.py +0 -44
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/__init__.py +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/distributed.py +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/env_wrappers.py +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/experimental.py +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/mock_env.py +0 -0
- {evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/requirements.txt +0 -0
{evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.1.10
+Version: 0.1.14
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization

@@ -215,4 +215,15 @@ agent.load('./agent.pt')
 }
 ```

+```bibtex
+@article{Ash2019OnTD,
+    title   = {On the Difficulty of Warm-Starting Neural Network Training},
+    author  = {Jordan T. Ash and Ryan P. Adams},
+    journal = {ArXiv},
+    year    = {2019},
+    volume  = {abs/1910.08475},
+    url     = {https://api.semanticscholar.org/CorpusID:204788802}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
{evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/README.md
RENAMED

@@ -162,4 +162,15 @@ agent.load('./agent.pt')
 }
 ```

+```bibtex
+@article{Ash2019OnTD,
+    title   = {On the Difficulty of Warm-Starting Neural Network Training},
+    author  = {Jordan T. Ash and Ryan P. Adams},
+    journal = {ArXiv},
+    year    = {2019},
+    volume  = {abs/1910.08475},
+    url     = {https://api.semanticscholar.org/CorpusID:204788802}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
{evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/evolutionary_policy_optimization/epo.py
RENAMED

@@ -76,6 +76,14 @@ def maybe(fn):
 def interface_torch_numpy(fn, device):
     # for a given function, move all inputs from torch tensor to numpy, and all outputs from numpy to torch tensor

+    def to_torch_tensor(t):
+        if isinstance(t, (np.ndarray, np.float64)):
+            t = from_numpy(np.array(t))
+        elif isinstance(t, (float, int, bool)):
+            t = tensor(t)
+
+        return t.to(device)
+
     @wraps(fn)
     def decorated_fn(*args, **kwargs):

@@ -83,7 +91,7 @@ def interface_torch_numpy(fn, device):

         out = fn(*args, **kwargs)

-        out = tree_map(
+        out = tree_map(to_torch_tensor, out)
         return out

     return decorated_fn
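The new nested helper `to_torch_tensor` is what `tree_map` now applies to the wrapped function's output, so nested environment returns (arrays, floats, ints, bools) come back as tensors on the configured device. Below is a minimal self-contained sketch of that output-conversion pattern; the wrapped `fake_step` function is hypothetical and only stands in for an environment call:

```python
from functools import wraps

import numpy as np
import torch
from torch import tensor, from_numpy
from torch.utils._pytree import tree_map

def interface_torch_numpy_sketch(fn, device):
    # sketch of the output half: convert numpy / python scalar leaves to torch tensors on `device`

    def to_torch_tensor(t):
        if isinstance(t, (np.ndarray, np.float64)):
            t = from_numpy(np.array(t))
        elif isinstance(t, (float, int, bool)):
            t = tensor(t)
        return t.to(device)

    @wraps(fn)
    def decorated_fn(*args, **kwargs):
        out = fn(*args, **kwargs)
        return tree_map(to_torch_tensor, out)

    return decorated_fn

# hypothetical numpy-based step function standing in for an environment
def fake_step(action):
    return np.random.randn(4), 1.0, False

step = interface_torch_numpy_sketch(fake_step, device = torch.device('cpu'))
obs, reward, done = step(0)   # each output is now a torch tensor on cpu
```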
@@ -146,6 +154,24 @@ def temp_batch_dim(fn):

     return inner

+# plasticity related
+
+def shrink_and_perturb_(
+    module,
+    shrink_factor = 0.5,
+    perturb_factor = 0.01
+):
+    # Shrink & Perturb
+    # Ash et al. https://arxiv.org/abs/1910.08475
+
+    assert 0. <= shrink_factor <= 1.
+
+    for p in module.parameters():
+        noise = torch.randn_like(p.data)
+        p.data.mul_(1. - shrink_factor).add_(noise * perturb_factor)
+
+    return module
+
 # fitness related

 def get_fitness_scores(
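The new `shrink_and_perturb_` implements Shrink & Perturb from Ash & Adams (the paper added to the citations above): every parameter is scaled by `1 - shrink_factor` and perturbed in place with Gaussian noise scaled by `perturb_factor`, a common way to restore plasticity when continuing training from a warm start. A small usage sketch; the `nn.Linear` module and the numbers are purely illustrative:

```python
import torch
from torch import nn

from evolutionary_policy_optimization.epo import shrink_and_perturb_

net = nn.Linear(4, 2)
before = net.weight.detach().clone()

shrink_and_perturb_(net, shrink_factor = 0.5, perturb_factor = 0.01)

# each weight is now 0.5 * old value plus ~N(0, 0.01^2) noise
print((net.weight.detach() - 0.5 * before).abs().mean())
```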
@@ -267,37 +293,42 @@ class PowerLawDist(Module):
 class MLP(Module):
     def __init__(
         self,
-
+        dim,
+        depth,
         dim_latent = 0,
+        expansion_factor = 2.
     ):
         super().__init__()
         dim_latent = default(dim_latent, 0)

-        assert len(dims) >= 2, 'must have at least two dimensions'
-
-        # add the latent to the first dim
-
-        first_dim, *rest_dims = dims
-        dims = (first_dim + dim_latent, *rest_dims)
-
         self.dim_latent = dim_latent

         self.needs_latent = dim_latent > 0

         self.encode_latent = nn.Sequential(
-            Linear(dim_latent,
+            Linear(dim_latent, dim),
             nn.SiLU()
         ) if self.needs_latent else None

-
+        dim_hidden = int(dim * expansion_factor)

-
+        # layers

-
+        layers = []
+
+        for _ in range(depth):
+            layer = nn.Sequential(
+                nn.LayerNorm(dim, bias = False),
+                nn.Linear(dim, dim_hidden),
+                nn.SiLU(),
+                nn.Linear(dim_hidden, dim),
+            )

-
+            layers.append(layer)

-
+        # modules across layers
+
+        self.layers = ModuleList(layers)

     def forward(
         self,

@@ -319,17 +350,14 @@ class MLP(Module):

         assert latent.shape[0] == x.shape[0], f'received state with batch size {x.shape[0]} but latent ids received had batch size {latent_id.shape[0]}'

-        x =
+        x = x * latent

         # layers

         for ind, layer in enumerate(self.layers, start = 1):
             is_last = ind == len(self.layers)

-            x = layer(x)
-
-            if not is_last:
-                x = F.silu(x)
+            x = layer(x) + x

         return x

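Taken together, the two MLP hunks replace the old `dims` tuple interface with a single width `dim` and a `depth`, build pre-norm feedforward blocks (`LayerNorm → Linear → SiLU → Linear`) with residual connections, and fold the latent in by elementwise multiplication instead of concatenating it onto the first layer. A standalone sketch of the new block structure, with latent conditioning omitted and all names illustrative:

```python
import torch
from torch import nn

class ResidualMLPSketch(nn.Module):
    # pre-norm residual feedforward stack, mirroring the refactored MLP
    def __init__(self, dim, depth, expansion_factor = 2.):
        super().__init__()
        dim_hidden = int(dim * expansion_factor)

        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(dim, bias = False),
                nn.Linear(dim, dim_hidden),
                nn.SiLU(),
                nn.Linear(dim_hidden, dim),
            )
            for _ in range(depth)
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x) + x   # residual; no inter-layer activation needed
        return x

mlp = ResidualMLPSketch(dim = 128, depth = 4)
out = mlp(torch.randn(2, 128))   # shape (2, 128)
```

With residual blocks, depth can be varied freely without re-tuning a tuple of hidden sizes.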
@@ -341,26 +369,24 @@ class Actor(Module):
         self,
         dim_state,
         num_actions,
-
+        dim,
+        mlp_depth,
         dim_latent = 0,
     ):
         super().__init__()

-        assert len(dim_hiddens) >= 2
-        dim_first, *_, dim_last = dim_hiddens
-
         self.dim_latent = dim_latent

         self.init_layer = nn.Sequential(
-            nn.Linear(dim_state,
+            nn.Linear(dim_state, dim),
             nn.SiLU()
         )

-        self.mlp = MLP(
+        self.mlp = MLP(dim = dim, depth = mlp_depth, dim_latent = dim_latent)

         self.to_out = nn.Sequential(
-            nn.
-            nn.Linear(
+            nn.LayerNorm(dim, bias = False),
+            nn.Linear(dim, num_actions, bias = False),
         )

     def forward(

@@ -379,34 +405,31 @@ class Critic(Module):
     def __init__(
         self,
         dim_state,
-
+        dim,
+        mlp_depth,
         dim_latent = 0,
         use_regression = False,
         hl_gauss_loss_kwargs: dict = dict(
-            min_value = -
-            max_value =
-            num_bins =
-            sigma = 0.5
+            min_value = -100.,
+            max_value = 100.,
+            num_bins = 200
         )
     ):
         super().__init__()

-        assert len(dim_hiddens) >= 2
-        dim_first, *_, dim_last = dim_hiddens
-
         self.dim_latent = dim_latent

         self.init_layer = nn.Sequential(
-            nn.Linear(dim_state,
+            nn.Linear(dim_state, dim),
             nn.SiLU()
         )

-        self.mlp = MLP(
+        self.mlp = MLP(dim = dim, depth = mlp_depth, dim_latent = dim_latent)

-        self.
+        self.final_norm = nn.LayerNorm(dim, bias = False)

         self.to_pred = HLGaussLayer(
-            dim =
+            dim = dim,
             use_regression = use_regression,
             hl_gauss_loss = hl_gauss_loss_kwargs
         )

@@ -470,7 +493,7 @@ class Critic(Module):

         hidden = self.mlp(hidden, latent)

-        hidden = self.
+        hidden = self.final_norm(hidden)

         pred_kwargs = dict(return_logits = return_logits) if not self.use_regression else dict()
         return self.to_pred(hidden, **pred_kwargs)
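Actor and Critic follow the same convention: each now takes its state dimension plus a single `dim` and `mlp_depth`, ends with a bias-free `LayerNorm`, and the critic keeps its HL-Gauss head (now defaulting to 200 bins over [-100, 100]). A construction-only sketch against the new signatures, with arbitrary sizes:

```python
from evolutionary_policy_optimization.epo import Actor, Critic

actor = Actor(
    dim_state = 8,
    num_actions = 4,
    dim = 128,
    mlp_depth = 2,
    dim_latent = 32
)

critic = Critic(
    dim_state = 8,
    dim = 256,
    mlp_depth = 4,
    dim_latent = 32
)
```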
@@ -825,16 +848,16 @@ class Agent(Module):
         critic: Critic,
         latent_gene_pool: LatentGenePool | None,
         optim_klass = AdoptAtan2,
-        actor_lr =
-        critic_lr =
+        actor_lr = 8e-4,
+        critic_lr = 8e-4,
         latent_lr = 1e-5,
-        actor_weight_decay =
-        critic_weight_decay =
+        actor_weight_decay = 5e-4,
+        critic_weight_decay = 5e-4,
         diversity_aux_loss_weight = 0.,
         use_critic_ema = True,
-        critic_ema_beta = 0.
-        max_grad_norm = 0
-        batch_size =
+        critic_ema_beta = 0.95,
+        max_grad_norm = 1.0,
+        batch_size = 32,
         calc_gae_kwargs: dict = dict(
             use_accelerated = False,
             gamma = 0.99,

@@ -1251,8 +1274,10 @@ def create_agent(
     num_latents,
     dim_latent,
     actor_num_actions,
-
-
+    actor_dim,
+    actor_mlp_depth,
+    critic_dim,
+    critic_mlp_depth,
     use_critic_ema = True,
     latent_gene_pool_kwargs: dict = dict(),
     actor_kwargs: dict = dict(),

@@ -1275,14 +1300,16 @@ def create_agent(
         num_actions = actor_num_actions,
         dim_state = dim_state,
         dim_latent = dim_latent,
-
+        dim = actor_dim,
+        mlp_depth = actor_mlp_depth,
         **actor_kwargs
     )

     critic = Critic(
         dim_state = dim_state,
         dim_latent = dim_latent,
-
+        dim = critic_dim,
+        mlp_depth = critic_mlp_depth,
         **critic_kwargs
     )

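`create_agent` is updated accordingly: the former `actor_dim_hiddens` / `critic_dim_hiddens` tuples (still visible in the deleted train_gym.py below) give way to per-network `dim` and `mlp_depth` arguments. A call sketch with illustrative values, assuming the factory is imported from the package root as in the project README:

```python
from evolutionary_policy_optimization import create_agent

agent = create_agent(
    dim_state = 512,
    num_latents = 16,
    dim_latent = 32,
    actor_num_actions = 5,
    actor_dim = 256,
    actor_mlp_depth = 2,
    critic_dim = 256,
    critic_mlp_depth = 4,
    use_critic_ema = True
)
```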
@@ -1457,7 +1484,7 @@ class EPO(Module):
                     log_prob,
                     reward,
                     value,
-
+                    terminated
                 )

                 memory = Memory(*tuple(t.cpu() for t in memory))

@@ -1469,7 +1496,7 @@ class EPO(Module):
                 if not terminated:
                     # add bootstrap value if truncated

-                    next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent)
+                    next_value = temp_batch_dim(self.agent.get_critic_values)(state, latent = latent, use_ema_if_available = True, use_unwrapped_model = True)

                 memory_for_gae = memory._replace(
                     episode_id = invalid_episode,
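The last change feeds the end-of-rollout state through the EMA critic (on the unwrapped model) to get the bootstrap value used when an episode is truncated rather than terminated: that value stands in for the rewards that were cut off, entering advantage estimation as the value of the state after the last stored step. A minimal, hypothetical GAE sketch (not the library's internal `calc_gae`) showing where the bootstrap value enters:

```python
import torch

def gae_sketch(rewards, values, next_value, gamma = 0.99, lam = 0.95):
    # rewards: (T,), values: (T,); next_value is the critic's estimate for the
    # truncated final state, appended so the last delta can look one step ahead
    values = torch.cat((values, next_value.reshape(1)))

    advantages = torch.zeros_like(rewards)
    gae = 0.

    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae

    return advantages

rewards = torch.tensor([1., 1., 1.])
values  = torch.tensor([0.5, 0.6, 0.7])
advantages = gae_sketch(rewards, values, next_value = torch.tensor(0.8))
```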
{evolutionary_policy_optimization-0.1.10 → evolutionary_policy_optimization-0.1.14}/tests/test_epo.py
RENAMED

@@ -1,10 +1,11 @@
 import pytest

 import torch
-from evolutionary_policy_optimization import (
+from evolutionary_policy_optimization.epo import (
     LatentGenePool,
     Actor,
-    Critic
+    Critic,
+    shrink_and_perturb_
 )

 @pytest.mark.parametrize('latent_ids', (2, (2, 4)))

@@ -128,3 +129,5 @@ def test_e2e_with_mock_env(

     agent.save('./agent.pt', overwrite = True)
     agent.load('./agent.pt')
+
+    shrink_and_perturb_(agent)
evolutionary_policy_optimization-0.1.14/train_gym.py
ADDED

@@ -0,0 +1,63 @@
+import torch
+
+from evolutionary_policy_optimization import (
+    EPO,
+    GymnasiumEnvWrapper
+)
+
+# gymnasium
+
+from shutil import rmtree
+import gymnasium as gym
+
+env = gym.make(
+    'LunarLander-v3',
+    render_mode = 'rgb_array'
+)
+
+rmtree('./recordings', ignore_errors = True)
+
+env = gym.wrappers.RecordVideo(
+    env = env,
+    video_folder = './recordings',
+    name_prefix = 'lunar-video',
+    episode_trigger = lambda eps_num: (eps_num % 250) == 0,
+    disable_logger = True
+)
+
+env = GymnasiumEnvWrapper(env)
+
+# epo
+
+agent = env.to_epo_agent(
+    num_latents = 1,
+    dim_latent = 32,
+    actor_dim = 128,
+    actor_mlp_depth = 2,
+    critic_dim = 256,
+    critic_mlp_depth = 4,
+    latent_gene_pool_kwargs = dict(
+        frac_natural_selected = 0.5,
+        frac_tournaments = 0.5
+    ),
+    accelerate_kwargs = dict(
+        cpu = False
+    ),
+    actor_optim_kwargs = dict(
+        cautious_factor = 0.1,
+    ),
+    critic_optim_kwargs = dict(
+        cautious_factor = 0.1,
+    ),
+)
+
+epo = EPO(
+    agent,
+    episodes_per_latent = 50,
+    max_episode_length = 500,
+    action_sample_temperature = 1.,
+)
+
+epo(agent, env, num_learning_cycles = 100)
+
+agent.save('./agent.pt', overwrite = True)
evolutionary_policy_optimization-0.1.10/train_gym.py
DELETED

@@ -1,44 +0,0 @@
-import torch
-
-from evolutionary_policy_optimization import (
-    EPO,
-    GymnasiumEnvWrapper
-)
-
-# gymnasium
-
-import gymnasium as gym
-
-env = gym.make(
-    'LunarLander-v3',
-    render_mode = 'rgb_array'
-)
-
-env = GymnasiumEnvWrapper(env)
-
-# epo
-
-agent = env.to_epo_agent(
-    num_latents = 8,
-    dim_latent = 32,
-    actor_dim_hiddens = (256, 128),
-    critic_dim_hiddens = (256, 128, 64),
-    latent_gene_pool_kwargs = dict(
-        frac_natural_selected = 0.5,
-        frac_tournaments = 0.5
-    ),
-    accelerate_kwargs = dict(
-        cpu = False
-    )
-)
-
-epo = EPO(
-    agent,
-    episodes_per_latent = 5,
-    max_episode_length = 10,
-    action_sample_temperature = 1.,
-)
-
-epo(agent, env, num_learning_cycles = 5)
-
-agent.save('./agent.pt', overwrite = True)
The remaining files (.github/workflows/python-publish.yml, .github/workflows/test.yml, .gitignore, LICENSE, evolutionary_policy_optimization/__init__.py, evolutionary_policy_optimization/distributed.py, evolutionary_policy_optimization/env_wrappers.py, evolutionary_policy_optimization/experimental.py, evolutionary_policy_optimization/mock_env.py, requirements.txt) are renamed under the new version prefix with no content changes.