evolutionary-policy-optimization 0.0.72__tar.gz → 0.1.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/PKG-INFO +1 -1
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/epo.py +5 -5
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/mock_env.py +9 -9
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/pyproject.toml +1 -1
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/requirements.txt +2 -0
- evolutionary_policy_optimization-0.1.0/train_gym.py +44 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/README.md +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/__init__.py +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/distributed.py +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/evolutionary_policy_optimization/experimental.py +0 -0
- {evolutionary_policy_optimization-0.0.72 → evolutionary_policy_optimization-0.1.0}/tests/test_epo.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.72
+Version: 0.1.0
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
```
evolutionary_policy_optimization/epo.py

```diff
@@ -70,7 +70,7 @@ def interface_torch_numpy(fn, device):
 
         out = fn(*args, **kwargs)
 
-        out = tree_map(lambda t: from_numpy(t).to(device) if isinstance(t, np.ndarray) else t, out)
+        out = tree_map(lambda t: from_numpy(np.array(t)).to(device) if isinstance(t, (np.ndarray, np.float64)) else t, out)
         return out
 
     return decorated_fn
```
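The widened `isinstance` check plus the `np.array(...)` wrap matters because `torch.from_numpy` only accepts ndarrays, while Gymnasium environments often hand back bare NumPy scalars (for example an `np.float64` reward). A minimal sketch of the failure mode and the fix; the concrete reward value is made up for illustration:

```python
import numpy as np
import torch

reward = np.float64(1.5)            # a bare NumPy scalar, not an ndarray

# torch.from_numpy(reward) would raise TypeError: expected np.ndarray,
# but wrapping in np.array first yields a 0-dim array that from_numpy accepts
reward_t = torch.from_numpy(np.array(reward))

assert reward_t.ndim == 0 and reward_t.item() == 1.5
```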
evolutionary_policy_optimization/epo.py

```diff
@@ -1176,7 +1176,7 @@ class EPO(Module):
             if fix_environ_across_latents:
                 maybe_seed = environment_seeds[episode_id]
 
-            yield latent_id, episode_id, maybe_seed
+            yield latent_id, episode_id, maybe_seed.item()
 
     @torch.no_grad()
     def gather_experience_from(
```
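`environment_seeds` is evidently a tensor here, and `.item()` converts the selected 0-dim entry into a plain Python int before it is later passed as `seed=` to the environment's `reset`. A small illustration; the seed tensor below is hypothetical:

```python
import torch

environment_seeds = torch.randint(0, 2 ** 31, (4,))  # hypothetical per-episode seeds

maybe_seed = environment_seeds[0]      # still a 0-dim torch tensor
seed = maybe_seed.item()               # plain Python int, safe to pass as reset(seed = ...)

assert isinstance(seed, int) and not torch.is_tensor(seed)
```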
evolutionary_policy_optimization/epo.py

```diff
@@ -1210,7 +1210,7 @@ class EPO(Module):
             if fix_environ_across_latents:
                 reset_kwargs.update(seed = maybe_seed)
 
-            state = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
+            state, _ = interface_torch_numpy(env.reset, device = self.device)(**reset_kwargs)
 
             # get latent from pool
 
```
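The extra unpack target reflects the Gymnasium convention that `reset` returns an `(observation, info)` pair rather than the observation alone; the info dict is simply discarded. For reference, against a stock Gymnasium environment:

```python
import gymnasium as gym

env = gym.make('CartPole-v1')

# Gymnasium's reset returns (observation, info); the info dict is often ignored
state, info = env.reset(seed = 42)

print(state.shape)   # (4,) for CartPole
print(type(info))    # dict
```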
evolutionary_policy_optimization/epo.py

```diff
@@ -1232,7 +1232,7 @@ class EPO(Module):
 
                 # get the next state, action, and reward
 
-                state, reward, truncated, terminated = interface_torch_numpy(env.
+                state, reward, truncated, terminated, _ = interface_torch_numpy(env.step, device = self.device)(action)
 
                 done = truncated or terminated
 
```
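The `env.step` call now unpacks five values instead of four, following the five-element return convention of Gymnasium-style `step` methods, with the trailing info dict discarded. For reference, Gymnasium's own `step` returns `(observation, reward, terminated, truncated, info)`:

```python
import gymnasium as gym

env = gym.make('CartPole-v1')
env.reset(seed = 0)

# Gymnasium's step returns a 5-tuple; the last element is an info dict
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

done = terminated or truncated
```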
evolutionary_policy_optimization/epo.py

```diff
@@ -1250,7 +1250,7 @@ class EPO(Module):
                     log_prob,
                     reward,
                     value,
-                    terminated
+                    tensor(terminated)
                 )
 
                 memory = Memory(*tuple(t.cpu() for t in memory))
```
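Wrapping `terminated` in `tensor(...)` keeps every field of the memory tuple a torch tensor, which the very next line relies on when it calls `.cpu()` on each element. A minimal sketch of that invariant; the boolean value is illustrative:

```python
import torch
from torch import tensor

terminated = True                  # e.g. a plain Python bool, as real environments typically return
terminated_t = tensor(terminated)  # -> tensor(True), dtype torch.bool

# every memory field must support .cpu(), which a plain bool does not
assert terminated_t.cpu().dtype == torch.bool
```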
evolutionary_policy_optimization/mock_env.py

```diff
@@ -22,34 +22,34 @@ class Env(Module):
         self.state_shape = cast_tuple(state_shape)
 
         self.can_terminate_after = can_terminate_after
-        self.register_buffer('
+        self.register_buffer('_step', tensor(0))
 
     @property
     def device(self):
-        return self.
+        return self._step.device
 
     def reset(
         self,
         seed = None
     ):
         state = randn(self.state_shape, device = self.device)
-        self.
-        return state.numpy()
+        self._step.zero_()
+        return state.numpy(), None
 
-    def
+    def step(
         self,
         actions,
     ):
         state = randn(self.state_shape, device = self.device)
         reward = randint(0, 5, (), device = self.device).float()
 
-        if self.
+        if self._step > self.can_terminate_after:
             truncated = tensor(choice((True, False)), device =self.device)
             terminated = tensor(choice((True, False)), device =self.device)
         else:
             truncated = terminated = tensor(False, device = self.device)
 
-        self.
+        self._step.add_(1)
 
-        out = state, reward, truncated, terminated
-        return tuple(t.numpy() for t in out)
+        out = (state, reward, truncated, terminated)
+        return (*tuple(t.numpy() for t in out), None)
```
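With these changes the mock `Env` mirrors the Gymnasium calling convention end to end: `reset` hands back `(state, info)` and `step` hands back a five-element tuple ending in the info slot (here always `None`). A rough usage sketch, assuming the constructor takes the state shape and the `can_terminate_after` threshold seen above:

```python
import numpy as np
from evolutionary_policy_optimization import Env

# constructor arguments inferred from the attributes set in the hunk above
env = Env((5,), can_terminate_after = 2)

state, _ = env.reset()                                            # numpy state plus info placeholder
state, reward, truncated, terminated, _ = env.step(np.zeros(1))   # the mock step ignores the action values

assert state.shape == (5,)
```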
train_gym.py (new file)

```diff
@@ -0,0 +1,44 @@
+# gymnasium
+
+import gymnasium as gym
+
+env = gym.make(
+    'LunarLander-v3',
+    render_mode = 'rgb_array'
+)
+
+state_dim = env.observation_space.shape[0]
+num_actions = env.action_space.n
+
+# epo
+
+import torch
+
+from evolutionary_policy_optimization import (
+    create_agent,
+    EPO,
+    Env
+)
+
+agent = create_agent(
+    dim_state = state_dim,
+    num_latents = 1,
+    dim_latent = 32,
+    actor_num_actions = num_actions,
+    actor_dim_hiddens = (256, 128),
+    critic_dim_hiddens = (256, 128, 64),
+    latent_gene_pool_kwargs = dict(
+        frac_natural_selected = 0.5
+    )
+)
+
+epo = EPO(
+    agent,
+    episodes_per_latent = 1,
+    max_episode_length = 10,
+    action_sample_temperature = 1.
+)
+
+epo.to('cpu' if not torch.cuda.is_available() else 'cuda')
+
+epo(agent, env, num_learning_cycles = 5)
```
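The new training script targets `LunarLander-v3`, which requires Gymnasium with Box2D support on top of the base install; the two lines added to `requirements.txt` are not shown in this diff. A quick sanity check of the environment half of the script alone, assuming `gymnasium[box2d]` is installed:

```python
import gymnasium as gym

env = gym.make('LunarLander-v3', render_mode = 'rgb_array')
state, _ = env.reset(seed = 0)

# LunarLander exposes an 8-dimensional observation and 4 discrete actions,
# which become dim_state and actor_num_actions in create_agent above
assert env.observation_space.shape[0] == 8
assert env.action_space.n == 4
```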