evolutionary-policy-optimization 0.0.40-py3-none-any.whl → 0.0.42-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -798,6 +798,10 @@ class Agent(Module):
 
         latents = self.latent_gene_pool(latent_id = latent_gene_ids)
 
+        orig_latents = latents
+        latents = latents.detach()
+        latents.requires_grad_()
+
         # learn actor
 
         logits = self.actor(states, latents)
@@ -822,6 +826,14 @@ class Agent(Module):
         self.critic_optim.step()
         self.critic_optim.zero_grad()
 
+        # maybe update latents, if not frozen
+
+        if not self.latent_gene_pool.frozen_latents:
+            orig_latents.backward(latents.grad)
+
+            self.latent_optim.step()
+            self.latent_optim.zero_grad()
+
         # apply evolution
 
         self.latent_gene_pool.genetic_algorithm_step(fitness_scores)
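
The two hunks above let gradients from the PPO update flow back into the latent gene pool: the selected latents are detached and given `requires_grad_()`, the actor and critic losses accumulate gradients on the detached copy, and `orig_latents.backward(latents.grad)` then routes that gradient into the original latents so `self.latent_optim` can step the pool. Below is a minimal, self-contained sketch of that detach-and-reattach pattern in plain PyTorch; the names (`pool`, `latent_optim`, the toy loss) are placeholders for illustration, not the package's API.

```python
import torch

# stand-ins for the latent gene pool and its optimizer (hypothetical names)
pool = torch.nn.Parameter(torch.randn(4, 8))
latent_optim = torch.optim.Adam([pool], lr = 1e-3)

orig_latents = pool[0]            # latents picked from the pool, still attached to the graph
latents = orig_latents.detach()   # cut from the graph ...
latents.requires_grad_()          # ... but able to collect a gradient of their own

loss = (latents ** 2).sum()       # stand-in for the actor / critic losses
loss.backward()                   # populates latents.grad only; pool is untouched so far

# route the collected gradient back into the pool parameters
orig_latents.backward(latents.grad)

latent_optim.step()
latent_optim.zero_grad()
```

One benefit of this pattern is that the gene pool stays out of the actor/critic autograd graphs while the policy-gradient signal can still update the latents whenever they are not frozen.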
@@ -863,25 +875,32 @@ def create_agent(
     actor_num_actions,
     actor_dim_hiddens: int | tuple[int, ...],
     critic_dim_hiddens: int | tuple[int, ...],
+    latent_gene_pool_kwargs: dict = dict(),
+    actor_kwargs: dict = dict(),
+    critic_kwargs: dict = dict(),
 ) -> Agent:
 
     latent_gene_pool = LatentGenePool(
         num_latents = num_latents,
-        dim_latent = dim_latent
+        dim_latent = dim_latent,
+        **latent_gene_pool_kwargs
     )
 
     actor = Actor(
         num_actions = actor_num_actions,
         dim_state = dim_state,
         dim_latent = dim_latent,
-        dim_hiddens = actor_dim_hiddens
+        dim_hiddens = actor_dim_hiddens,
+        **actor_kwargs
     )
 
     critic = Critic(
         dim_state = dim_state,
         dim_latent = dim_latent,
-        dim_hiddens = critic_dim_hiddens
-    )
+        dim_hiddens = critic_dim_hiddens,
+        **critic_kwargs
+    )
+
     return Agent(actor = actor, critic = critic, latent_gene_pool = latent_gene_pool)
 
 # EPO - which is just PPO with natural selection of a population of latent variables conditioning the agent
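
With the hunk above, `create_agent` now accepts pass-through keyword dicts for the three submodules it builds. A hypothetical call could look like the sketch below; the dimensions are invented, `create_agent` is assumed to be importable from the package root, and `frozen_latents` is only assumed to be a `LatentGenePool` constructor argument (the diff shows it read as an attribute in `Agent`, not as part of that signature).

```python
from evolutionary_policy_optimization import create_agent

# illustrative values only
agent = create_agent(
    dim_state = 512,
    num_latents = 16,
    dim_latent = 32,
    actor_num_actions = 5,
    actor_dim_hiddens = (256, 128),
    critic_dim_hiddens = (256, 128),
    latent_gene_pool_kwargs = dict(frozen_latents = False),  # forwarded to LatentGenePool
    actor_kwargs = dict(),                                    # forwarded to Actor
    critic_kwargs = dict()                                    # forwarded to Critic
)
```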
@@ -951,7 +970,7 @@ class EPO(Module):
 
         done = tensor(False)
 
-        while time < self.max_episode_length:
+        while time < self.max_episode_length and not done:
 
             # sample action
 
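The one-line change above makes a rollout stop as soon as the environment reports `done`, rather than always padding the episode out to `max_episode_length`. A toy illustration of the difference, independent of the package:

```python
from torch import tensor

max_episode_length = 5
time, done, steps = 0, tensor(False), 0

while time < max_episode_length and not done:
    done = tensor(time == 2)   # pretend the environment terminates at step 2
    steps += 1
    time += 1

assert steps == 3              # without the `not done` guard, all 5 steps would run
```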
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.0.40
+Version: 0.0.42
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -162,4 +162,25 @@ agent.load('./agent.pt')
 }
 ```
 
+```bibtex
+@inproceedings{Khadka2018EvolutionGuidedPG,
+    title     = {Evolution-Guided Policy Gradient in Reinforcement Learning},
+    author    = {Shauharda Khadka and Kagan Tumer},
+    booktitle = {Neural Information Processing Systems},
+    year      = {2018},
+    url       = {https://api.semanticscholar.org/CorpusID:53096951}
+}
+```
+
+```bibtex
+@article{Fortunato2017NoisyNF,
+    title   = {Noisy Networks for Exploration},
+    author  = {Meire Fortunato and Mohammad Gheshlaghi Azar and Bilal Piot and Jacob Menick and Ian Osband and Alex Graves and Vlad Mnih and R{\'e}mi Munos and Demis Hassabis and Olivier Pietquin and Charles Blundell and Shane Legg},
+    journal = {ArXiv},
+    year    = {2017},
+    volume  = {abs/1706.10295},
+    url     = {https://api.semanticscholar.org/CorpusID:5176587}
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
@@ -1,8 +1,8 @@
 evolutionary_policy_optimization/__init__.py,sha256=0q0aBuFgWi06MLMD8FiHzBYQ3_W4LYWrwmCtF3u5H2A,201
-evolutionary_policy_optimization/epo.py,sha256=VZOT1-jdZBE39awP7nhE-I1lGKMTfhUv4Dls9ptNsWk,29854
+evolutionary_policy_optimization/epo.py,sha256=Yf-iw1gqmAUEVzg6_PwYy-q4005eroZKUYGxNgwCsKk,30440
 evolutionary_policy_optimization/experimental.py,sha256=9FrJGviLESlYysHI3i83efT9g2ZB9ha4u3K9HXN98_w,1100
 evolutionary_policy_optimization/mock_env.py,sha256=QqVPZVJtrvQmSDcnYDTob_A5sDwiUzGj6_tmo6BII5c,918
-evolutionary_policy_optimization-0.0.40.dist-info/METADATA,sha256=5ruqqTCmYto8tqkRlc_peBgRhWkhmRdzUef2ot67ky0,5409
-evolutionary_policy_optimization-0.0.40.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.0.40.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.0.40.dist-info/RECORD,,
+evolutionary_policy_optimization-0.0.42.dist-info/METADATA,sha256=wiDM3tKsE9zHhyZJGaGcSA-jZuo38W4b_SCU2vQvpGc,6213
+evolutionary_policy_optimization-0.0.42.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.0.42.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.0.42.dist-info/RECORD,,