evolutionary-policy-optimization 0.0.45__tar.gz → 0.0.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/PKG-INFO +1 -1
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/epo.py +36 -12
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/mock_env.py +2 -1
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/pyproject.toml +1 -1
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/python-publish.yml +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/test.yml +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.gitignore +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/LICENSE +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/README.md +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/__init__.py +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/experimental.py +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/requirements.txt +0 -0
- {evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/tests/test_epo.py +0 -0
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.45
+Version: 0.0.47
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/epo.py
RENAMED
@@ -3,6 +3,7 @@ from __future__ import annotations
 from functools import partial, wraps
 from pathlib import Path
 from collections import namedtuple
+from random import randrange

 import torch
 from torch import nn, cat, stack, is_tensor, tensor
@@ -87,6 +88,11 @@ def temp_batch_dim(fn):

     return inner

+# fitness related
+
+def get_fitness_scores(cum_rewards, memories):
+    return cum_rewards
+
 # generalized advantage estimate

 def calc_generalized_advantage_estimate(
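The new get_fitness_scores hook defaults to passing the per-latent cumulative rewards straight through, and Agent (below) accepts any callable with the same (cum_rewards, memories) -> Tensor shape. A minimal sketch of a hypothetical drop-in replacement; the z-normalization here is illustrative, not part of the package:

import torch

# hypothetical replacement matching the (cum_rewards, memories) -> Tensor
# interface above; z-normalizes rewards so fitness is scale invariant
def normalized_fitness_scores(cum_rewards, memories):
    return (cum_rewards - cum_rewards.mean()) / cum_rewards.std().clamp(min = 1e-5)

# quick check on dummy per-latent cumulative rewards
scores = normalized_fitness_scores(torch.tensor([1., 5., 3., 9.]), memories = [])
print(scores)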
@@ -643,6 +649,7 @@ class Agent(Module):
         actor_optim_kwargs: dict = dict(),
         critic_optim_kwargs: dict = dict(),
         latent_optim_kwargs: dict = dict(),
+        get_fitness_scores: Callable[..., Tensor] = get_fitness_scores
     ):
         super().__init__()

@@ -663,6 +670,10 @@ class Agent(Module):
         self.actor_loss = partial(actor_loss, **actor_loss_kwargs)
         self.calc_gae = partial(calc_generalized_advantage_estimate, **calc_gae_kwargs)

+        # fitness score related
+
+        self.get_fitness_scores = get_fitness_scores
+
         # learning hparams

         self.batch_size = batch_size
@@ -766,10 +777,12 @@ class Agent(Module):

     def forward(
         self,
-
+        memories_and_cumulative_rewards: MemoriesAndCumulativeRewards,
         epochs = 2
     ):
-        memories,
+        memories, cumulative_rewards = memories_and_cumulative_rewards
+
+        fitness_scores = self.get_fitness_scores(cumulative_rewards, memories)

         (
             episode_ids,
@@ -952,9 +965,9 @@ Memory = namedtuple('Memory', [
     'done'
 ])

-
+MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
     'memories',
-    '
+    'cumulative_rewards'
 ])

 class EPO(Module):
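The renamed namedtuple bundles the rollout memories with one cumulative reward per latent. A small standalone sketch of how it is built and unpacked; the namedtuple is redefined locally so the snippet runs on its own, and the dummy values are illustrative:

from collections import namedtuple

import torch

# redefined locally so the sketch stands alone; mirrors the definition above
MemoriesAndCumulativeRewards = namedtuple('MemoriesAndCumulativeRewards', [
    'memories',
    'cumulative_rewards'
])

rollout = MemoriesAndCumulativeRewards(
    memories = [],                       # list[Memory] in a real rollout
    cumulative_rewards = torch.zeros(8)  # one running total per latent
)

# Agent.forward unpacks the bundle the same way
memories, cumulative_rewards = rollout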
@@ -977,8 +990,9 @@ class EPO(Module):
     @torch.no_grad()
     def forward(
         self,
-        env
-
+        env,
+        fix_seed_across_latents = True
+    ) -> MemoriesAndCumulativeRewards:

         self.agent.eval()

@@ -986,16 +1000,26 @@

         memories: list[Memory] = []

-
+        cumulative_rewards = torch.zeros((self.num_latents))

         for episode_id in tqdm(range(self.episodes_per_latent), desc = 'episode'):

+            # maybe fix seed for environment across all latents
+
+            env_reset_kwargs = dict()
+
+            if fix_seed_across_latents:
+                seed = randrange(int(1e6))
+                env_reset_kwargs = dict(seed = seed)
+
+            # for each latent (on a single machine for now)
+
             for latent_id in tqdm(range(self.num_latents), desc = 'latent'):
                 time = 0

                 # initial state

-                state = env.reset()
+                state = env.reset(**env_reset_kwargs)

                 # get latent from pool

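With fix_seed_across_latents enabled, one seed is drawn per episode and forwarded to every latent's env.reset, so all latents start from the same initial conditions and their cumulative rewards remain comparable. A toy sketch of the seeding contract this assumes; ToyEnv is hypothetical, not the package's mock env:

import random
from random import randrange

# toy environment illustrating the reset(seed = ...) contract assumed above
class ToyEnv:
    def reset(self, seed = None):
        self.rng = random.Random(seed)
        return self.rng.random()  # initial state

env = ToyEnv()
num_latents = 4

seed = randrange(int(1e6))  # one seed per episode, shared across all latents

states = [env.reset(seed = seed) for _ in range(num_latents)]
assert len(set(states)) == 1  # every latent starts from the same initial state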
@@ -1019,9 +1043,9 @@

                     state, reward, done = env(action)

-                    # update
+                    # update cumulative rewards per latent, to be used as default fitness score

-
+                    cumulative_rewards[latent_id] += reward

                     # store memories

@@ -1051,7 +1075,7 @@

                 memories.append(memory_for_gae)

-        return
+        return MemoriesAndCumulativeRewards(
             memories = memories,
-
+            cumulative_rewards = cumulative_rewards
         )
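Taken together, EPO.forward now returns the MemoriesAndCumulativeRewards bundle and Agent.forward consumes it directly. A hedged wiring sketch, assuming epo, agent and env are constructed elsewhere; their constructors are outside this diff:

# hypothetical training round; `epo`, `agent` and `env` are assumed to be
# constructed elsewhere (their constructor arguments are not part of this diff)
def train_one_round(epo, agent, env, epochs = 2):
    # roll out every latent, collecting memories plus per-latent cumulative rewards
    rollout = epo(env, fix_seed_across_latents = True)

    # Agent.forward unpacks the bundle and derives fitness scores
    # through its get_fitness_scores callable
    agent(rollout, epochs = epochs)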
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/python-publish.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.github/workflows/test.yml
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/.gitignore
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/LICENSE
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/README.md
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/__init__.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/evolutionary_policy_optimization/experimental.py
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/requirements.txt
RENAMED
File without changes
{evolutionary_policy_optimization-0.0.45 → evolutionary_policy_optimization-0.0.47}/tests/test_epo.py
RENAMED
File without changes