evolutionary-policy-optimization 0.2.14 → 0.2.15 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

evolutionary_policy_optimization/epo.py

@@ -1063,6 +1063,7 @@ class Agent(Module):
         critic_loss_kwargs: dict = dict(
             eps_clip = 0.4
         ),
+        use_spo = False, # Simple Policy Optimization - Xie et al. https://arxiv.org/abs/2401.16025v9
         use_improved_critic_loss = True,
         shrink_and_perturb_every = None,
         shrink_and_perturb_kwargs: dict = dict(),
@@ -1126,6 +1127,7 @@ class Agent(Module):
         self.actor_loss = partial(actor_loss, **actor_loss_kwargs)
         self.critic_loss_kwargs = critic_loss_kwargs
 
+        self.use_spo = use_spo
         self.use_improved_critic_loss = use_improved_critic_loss
 
         # fitness score related
@@ -1399,7 +1401,7 @@ class Agent(Module):
 
         logits = self.actor(states, latents)
 
-        actor_loss = self.actor_loss(logits, log_probs, actions, advantages)
+        actor_loss = self.actor_loss(logits, log_probs, actions, advantages, use_spo = self.use_spo)
 
         actor_loss.backward()
 
@@ -1498,7 +1500,8 @@ def actor_loss(
     eps_clip = 0.2,
     entropy_weight = .01,
     eps = 1e-5,
-    norm_advantages = True
+    norm_advantages = True,
+    use_spo = False
 ):
     batch = logits.shape[0]
 
@@ -1506,14 +1509,22 @@ def actor_loss(
 
     ratio = (log_probs - old_log_probs).exp()
 
-    # classic clipped surrogate loss from ppo
-
-    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
-
     if norm_advantages:
         advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
-    actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
+    if use_spo:
+        # simple policy optimization - line 14 Algorithm 1 https://arxiv.org/abs/2401.16025v9
+
+        actor_loss = - (
+            ratio * advantages -
+            advantages.abs() / (2 * eps_clip) * (ratio - 1.).square()
+        )
+    else:
+        # classic clipped surrogate loss from ppo
+
+        clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
+
+        actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
     # add entropy loss for exploration
 
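Functionally, the new branch swaps PPO's hard ratio clip for a quadratic penalty on the ratio's distance from 1, scaled by `advantages.abs() / (2 * eps_clip)` (line 14, Algorithm 1 of Xie et al.). Below is a self-contained sketch contrasting the two surrogates; the function names are illustrative and not part of the package API.

```python
import torch

def ppo_clip_surrogate(ratio, advantages, eps_clip = 0.2):
    # classic clipped surrogate: the gradient is cut off once the ratio leaves [1 - eps, 1 + eps]
    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
    return -torch.min(clipped_ratio * advantages, ratio * advantages)

def spo_surrogate(ratio, advantages, eps_clip = 0.2):
    # simple policy optimization: a smooth quadratic penalty replaces the hard clip
    return -(ratio * advantages - advantages.abs() / (2 * eps_clip) * (ratio - 1.).square())

ratio = torch.tensor([0.7, 1.0, 1.5])
advantages = torch.tensor([1.0, -0.5, 2.0])
print(ppo_clip_surrogate(ratio, advantages))   # both surrogates equal -advantages at ratio == 1
print(spo_surrogate(ratio, advantages))
```

Since `use_spo` defaults to `False` everywhere it is introduced, 0.2.15 reproduces the 0.2.14 actor loss unless the flag is explicitly enabled.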

evolutionary_policy_optimization-0.2.14.dist-info/METADATA → evolutionary_policy_optimization-0.2.15.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.2.14
+Version: 0.2.15
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -300,4 +300,16 @@ That's it
 }
 ```
 
+```bibtex
+@misc{xie2025simplepolicyoptimization,
+    title         = {Simple Policy Optimization},
+    author        = {Zhengpeng Xie and Qiang Zhang and Fan Yang and Marco Hutter and Renjing Xu},
+    year          = {2025},
+    eprint        = {2401.16025},
+    archivePrefix = {arXiv},
+    primaryClass  = {cs.LG},
+    url           = {https://arxiv.org/abs/2401.16025},
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel

evolutionary_policy_optimization-0.2.14.dist-info/RECORD → evolutionary_policy_optimization-0.2.15.dist-info/RECORD

@@ -1,10 +1,10 @@
 evolutionary_policy_optimization/__init__.py,sha256=NyiYDYU7DlpmOTM7xiBQET3r1WwX0ebrgMCBLSQrW3c,288
 evolutionary_policy_optimization/distributed.py,sha256=clN8Bvhb6JIQy2F8FTF312B0RN3iYoPkKjZreBtAwks,2366
 evolutionary_policy_optimization/env_wrappers.py,sha256=bDL06o9_b1iW6k3fw2xifnOnYlzs643tdW6Yv2gsIdw,803
-evolutionary_policy_optimization/epo.py,sha256=dp-SGhg9_NGO8Rezf8K2xlWYCpOXRKB2JaRy5UGP2uo,53142
+evolutionary_policy_optimization/epo.py,sha256=oA8Ft5VRq5XorCZa0Hrxh6rTDEQg7DzSVfR0MwPg5J4,53591
 evolutionary_policy_optimization/experimental.py,sha256=QZG0__wwFqHN_LJK7e-mHxlIL1mwjlvG6o6bcOpeAKg,6166
 evolutionary_policy_optimization/mock_env.py,sha256=TLyyRm6tOD0Kdn9QqJJQriaSnsR-YmNQHo4OohmZFG4,1410
-evolutionary_policy_optimization-0.2.14.dist-info/METADATA,sha256=opgQguJ4n6CgOYToRoNu9RTBJpM6oiamSOKJsHVV0FE,9972
-evolutionary_policy_optimization-0.2.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-evolutionary_policy_optimization-0.2.14.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-evolutionary_policy_optimization-0.2.14.dist-info/RECORD,,
+evolutionary_policy_optimization-0.2.15.dist-info/METADATA,sha256=7w5tZm9yIeNTHm2d5npon5Nd5ekUpb9ahkf6VJkSZj0,10319
+evolutionary_policy_optimization-0.2.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.2.15.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.2.15.dist-info/RECORD,,