evolutionary-policy-optimization 0.2.14-py3-none-any.whl → 0.2.15-py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- evolutionary_policy_optimization/epo.py +18 -7
- {evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/METADATA +13 -1
- {evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/RECORD +5 -5
- {evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/WHEEL +0 -0
- {evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/licenses/LICENSE +0 -0
evolutionary_policy_optimization/epo.py

@@ -1063,6 +1063,7 @@ class Agent(Module):
         critic_loss_kwargs: dict = dict(
             eps_clip = 0.4
         ),
+        use_spo = False, # Simple Policy Optimization - Xie et al. https://arxiv.org/abs/2401.16025v9
         use_improved_critic_loss = True,
         shrink_and_perturb_every = None,
         shrink_and_perturb_kwargs: dict = dict(),
@@ -1126,6 +1127,7 @@ class Agent(Module):
         self.actor_loss = partial(actor_loss, **actor_loss_kwargs)
         self.critic_loss_kwargs = critic_loss_kwargs
 
+        self.use_spo = use_spo
         self.use_improved_critic_loss = use_improved_critic_loss
 
         # fitness score related
@@ -1399,7 +1401,7 @@ class Agent(Module):
 
         logits = self.actor(states, latents)
 
-        actor_loss = self.actor_loss(logits, log_probs, actions, advantages)
+        actor_loss = self.actor_loss(logits, log_probs, actions, advantages, use_spo = self.use_spo)
 
         actor_loss.backward()
 
@@ -1498,7 +1500,8 @@ def actor_loss(
     eps_clip = 0.2,
     entropy_weight = .01,
     eps = 1e-5,
-    norm_advantages = True
+    norm_advantages = True,
+    use_spo = False
 ):
     batch = logits.shape[0]
 
@@ -1506,14 +1509,22 @@ def actor_loss(
 
     ratio = (log_probs - old_log_probs).exp()
 
-    # classic clipped surrogate loss from ppo
-
-    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
-
     if norm_advantages:
         advantages = F.layer_norm(advantages, (batch,), eps = eps)
 
-    actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
+    if use_spo:
+        # simple policy optimization - line 14 Algorithm 1 https://arxiv.org/abs/2401.16025v9
+
+        actor_loss = - (
+            ratio * advantages -
+            advantages.abs() / (2 * eps_clip) * (ratio - 1.).square()
+        )
+    else:
+        # classic clipped surrogate loss from ppo
+
+        clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
+
+        actor_loss = -torch.min(clipped_ratio * advantages, ratio * advantages)
 
     # add entropy loss for exploration
 
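For readers who want to compare the two objectives outside the package, here is a minimal standalone sketch of the surrogates shown in the hunk above. The helper names (`ppo_clipped_surrogate`, `spo_surrogate`) and the toy tensors are illustrative assumptions, not part of the package's API; only the formulas come from the diff.

```python
# Standalone sketch: the two actor-loss surrogates from the epo.py diff above.
# Helper names and toy data are hypothetical; formulas mirror the diff.

import torch

def ppo_clipped_surrogate(ratio, advantages, eps_clip = 0.2):
    # classic clipped surrogate loss from PPO
    clipped_ratio = ratio.clamp(min = 1. - eps_clip, max = 1. + eps_clip)
    return -torch.min(clipped_ratio * advantages, ratio * advantages)

def spo_surrogate(ratio, advantages, eps_clip = 0.2):
    # simple policy optimization surrogate - replaces the hard clip with a
    # quadratic penalty on (ratio - 1), scaled by |advantage| / (2 * eps_clip)
    return -(ratio * advantages - advantages.abs() / (2 * eps_clip) * (ratio - 1.).square())

if __name__ == '__main__':
    torch.manual_seed(0)

    # fake per-sample log-probs and advantages, just to exercise both losses
    log_probs = torch.randn(8)
    old_log_probs = log_probs + 0.1 * torch.randn(8)
    advantages = torch.randn(8)

    ratio = (log_probs - old_log_probs).exp()

    print('ppo loss:', ppo_clipped_surrogate(ratio, advantages).mean().item())
    print('spo loss:', spo_surrogate(ratio, advantages).mean().item())
```

One way to read the change: where PPO's `min(...)` term goes flat once the ratio leaves the clip window, the SPO quadratic term keeps a nonzero gradient that discourages large deviations of the ratio from 1, scaled by the advantage magnitude.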
{evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evolutionary-policy-optimization
-Version: 0.2.14
+Version: 0.2.15
 Summary: EPO - Pytorch
 Project-URL: Homepage, https://pypi.org/project/evolutionary-policy-optimization/
 Project-URL: Repository, https://github.com/lucidrains/evolutionary-policy-optimization
@@ -300,4 +300,16 @@ That's it
 }
 ```
 
+```bibtex
+@misc{xie2025simplepolicyoptimization,
+    title = {Simple Policy Optimization},
+    author = {Zhengpeng Xie and Qiang Zhang and Fan Yang and Marco Hutter and Renjing Xu},
+    year = {2025},
+    eprint = {2401.16025},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.LG},
+    url = {https://arxiv.org/abs/2401.16025},
+}
+```
+
 *Evolution is cleverer than you are.* - Leslie Orgel
{evolutionary_policy_optimization-0.2.14.dist-info → evolutionary_policy_optimization-0.2.15.dist-info}/RECORD

@@ -1,10 +1,10 @@
 evolutionary_policy_optimization/__init__.py,sha256=NyiYDYU7DlpmOTM7xiBQET3r1WwX0ebrgMCBLSQrW3c,288
 evolutionary_policy_optimization/distributed.py,sha256=clN8Bvhb6JIQy2F8FTF312B0RN3iYoPkKjZreBtAwks,2366
 evolutionary_policy_optimization/env_wrappers.py,sha256=bDL06o9_b1iW6k3fw2xifnOnYlzs643tdW6Yv2gsIdw,803
-evolutionary_policy_optimization/epo.py,sha256=
+evolutionary_policy_optimization/epo.py,sha256=oA8Ft5VRq5XorCZa0Hrxh6rTDEQg7DzSVfR0MwPg5J4,53591
 evolutionary_policy_optimization/experimental.py,sha256=QZG0__wwFqHN_LJK7e-mHxlIL1mwjlvG6o6bcOpeAKg,6166
 evolutionary_policy_optimization/mock_env.py,sha256=TLyyRm6tOD0Kdn9QqJJQriaSnsR-YmNQHo4OohmZFG4,1410
-evolutionary_policy_optimization-0.2.
-evolutionary_policy_optimization-0.2.
-evolutionary_policy_optimization-0.2.
-evolutionary_policy_optimization-0.2.
+evolutionary_policy_optimization-0.2.15.dist-info/METADATA,sha256=7w5tZm9yIeNTHm2d5npon5Nd5ekUpb9ahkf6VJkSZj0,10319
+evolutionary_policy_optimization-0.2.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+evolutionary_policy_optimization-0.2.15.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+evolutionary_policy_optimization-0.2.15.dist-info/RECORD,,