evolutionary-policy-optimization 0.0.72__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/PKG-INFO +2 -1
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/__init__.py +4 -0
- evolutionary_policy_optimization-0.1.1/evolutionary_policy_optimization/env_wrappers.py +36 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/epo.py +6 -5
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/mock_env.py +9 -9
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/pyproject.toml +2 -1
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/requirements.txt +2 -0
- evolutionary_policy_optimization-0.1.1/train_gym.py +43 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/README.md +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/distributed.py +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/experimental.py +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/tests/test_epo.py +0 -0
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.72
+Version: 0.1.1
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -34,6 +34,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
+Requires-Dist: accelerate
 Requires-Dist: adam-atan2-pytorch
 Requires-Dist: assoc-scan>=0.0.2
 Requires-Dist: einops>=0.8.1
evolutionary_policy_optimization-0.1.1/evolutionary_policy_optimization/env_wrappers.py
ADDED
@@ -0,0 +1,36 @@
+import torch
+from torch.nn import Module
+
+from evolutionary_policy_optimization.epo import create_agent, Agent
+
+class GymnasiumEnvWrapper(Module):
+    def __init__(
+        self,
+        env
+    ):
+        super().__init__()
+        self.env = env
+
+    def reset(self, *args, **kwargs):
+        return self.env.reset(*args, **kwargs)
+
+    def step(self, *args, **kwargs):
+        return self.env.step(*args, **kwargs)
+
+    def to_agent_hparams(self):
+        return dict(
+            dim_state = self.env.observation_space.shape[0],
+            actor_num_actions = self.env.action_space.n
+        )
+
+    def to_epo_agent(
+        self,
+        *args,
+        **kwargs
+    ) -> Agent:
+
+        return create_agent(
+            *args,
+            **self.to_agent_hparams(),
+            **kwargs
+        )
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/epo.py
RENAMED
@@ -70,7 +70,7 @@ def interface_torch_numpy(fn, device):
 
         out = fn(*args, **kwargs)
 
-        out = tree_map(lambda t: from_numpy(t).to(device) if isinstance(t, np.ndarray) else t, out)
+        out = tree_map(lambda t: from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t, out)
         return out
 
     return decorated_fn
@@ -1040,6 +1040,7 @@ def actor_loss(
 # agent contains the actor, critic, and the latent genetic pool
 
 def create_agent(
+    *,
     dim_state,
     num_latents,
     dim_latent,
@@ -1176,7 +1177,7 @@ class EPO(Module):
             if fix_environ_across_latents:
                 maybe_seed = environment_seeds[episode_id]
 
-            yield latent_id, episode_id, maybe_seed
+            yield latent_id, episode_id, maybe_seed.item()
 
     @torch.no_grad()
     def gather_experience_from(
@@ -1210,7 +1211,7 @@
             if fix_environ_across_latents:
                 reset_kwargs.update(seed = maybe_seed)
 
-            state = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
+            state, _ = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
 
             # get latent from pool
 
@@ -1232,7 +1233,7 @@
 
                 # get the next state, action, and reward
 
-                state, reward, truncated, terminated = interface_torch_numpy(env.
+                state, reward, truncated, terminated, _ = interface_torch_numpy(env.step, device = self.device)(action)
 
                 done = truncated or terminated
 
@@ -1250,7 +1251,7 @@
                     log_prob,
                     reward,
                     value,
-                    terminated
+                    tensor(terminated)
                 )
 
                 memory = Memory(*tuple(t.cpu() for t in memory))
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/mock_env.py
RENAMED
@@ -22,34 +22,34 @@ class Env(Module):
         self.state_shape = cast_tuple(state_shape)
 
         self.can_terminate_after = can_terminate_after
-        self.register_buffer('
+        self.register_buffer('_step', tensor(0))
 
     @property
     def device(self):
-        return self.
+        return self._step.device
 
     def reset(
         self,
         seed = None
     ):
         state = randn(self.state_shape, device = self.device)
-        self.
-        return state.numpy()
+        self._step.zero_()
+        return state.numpy(), None
 
-    def
+    def step(
         self,
         actions,
     ):
         state = randn(self.state_shape, device = self.device)
         reward = randint(0, 5, (), device = self.device).float()
 
-        if self.
+        if self._step > self.can_terminate_after:
             truncated = tensor(choice((True, False)), device = self.device)
             terminated = tensor(choice((True, False)), device = self.device)
         else:
             truncated = terminated = tensor(False, device = self.device)
 
-        self.
+        self._step.add_(1)
 
-        out = state, reward, truncated, terminated
-        return tuple(t.numpy() for t in out)
+        out = (state, reward, truncated, terminated)
+        return (*tuple(t.numpy() for t in out), None)
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "evolutionary-policy-optimization"
-version = "0.0.72"
+version = "0.1.1"
 description = "EPO - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
@@ -24,6 +24,7 @@ classifiers = [
 ]
 
 dependencies = [
+    "accelerate",
     "adam-atan2-pytorch",
     'assoc-scan>=0.0.2',
     'einx>=0.3.0',
evolutionary_policy_optimization-0.1.1/train_gym.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+
+from evolutionary_policy_optimization import (
+    EPO,
+    GymnasiumEnvWrapper
+)
+
+# gymnasium
+
+import gymnasium as gym
+
+env = gym.make(
+    'LunarLander-v3',
+    render_mode = 'rgb_array'
+)
+
+env = GymnasiumEnvWrapper(env)
+
+# epo
+
+agent = env.to_epo_agent(
+    num_latents = 8,
+    dim_latent = 32,
+    actor_dim_hiddens = (256, 128),
+    critic_dim_hiddens = (256, 128, 64),
+    latent_gene_pool_kwargs = dict(
+        frac_natural_selected = 0.5,
+        frac_tournaments = 0.5
+    )
+)
+
+epo = EPO(
+    agent,
+    episodes_per_latent = 1,
+    max_episode_length = 10,
+    action_sample_temperature = 1.
+)
+
+epo.to('cpu' if not torch.cuda.is_available() else 'cuda')
+
+epo(agent, env, num_learning_cycles = 1)
+
+agent.save('./agent.pt', overwrite = True)
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/python-publish.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.github/workflows/test.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/.gitignore
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/LICENSE
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/README.md
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/distributed.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/evolutionary_policy_optimization/experimental.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.1}/tests/test_epo.py
RENAMED
File without changes