openclaw-alignment 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/__init__.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ OpenClaw Alignment - 强化学习对齐系统
3
+
4
+ Phase 1-2: 核心功能和优化
5
+ Phase 3: 高级功能(分布式训练、自动调参、监控、性能优化)
6
+ """
7
+
8
+ # Phase 1: 核心模块
9
+ from .reward import RewardSignal, RewardCalculator
10
+ from .environment import InteractionEnvironment, State, Action
11
+ from .agent import AlignmentAgent, PolicyNetwork, ValueNetwork
12
+ from .learner import RLLearner
13
+ from .integration import RLAlignmentEngine
14
+
15
+ # Phase 2: 优化模块
16
+ from .nn_model import (
17
+ MLPModel,
18
+ PolicyNetworkPyTorch,
19
+ ValueNetworkPyTorch,
20
+ create_policy_network,
21
+ create_value_network,
22
+ TORCH_AVAILABLE
23
+ )
24
+ from .experience_replay import Experience, ExperienceReplay
25
+ from .trainer import RLTrainer
26
+
27
+ # Phase 3: 高级功能模块
28
+ from .distributed_trainer import (
29
+ DistributedTrainer,
30
+ DistributedTrainingConfig
31
+ )
32
+ from .hyperparameter_tuner import (
33
+ LearningRateScheduler,
34
+ HyperparameterSearch,
35
+ EarlyStopping,
36
+ HyperparameterTuner
37
+ )
38
+ from .monitoring import (
39
+ TrainingMonitor,
40
+ MetricsAnalyzer,
41
+ TENSORBOARD_AVAILABLE
42
+ )
43
+ from .performance_optimizer import (
44
+ BatchInference,
45
+ ModelQuantization,
46
+ InferenceCache,
47
+ JITOptimizer,
48
+ PerformanceOptimizer
49
+ )
50
+ from .paths import (
51
+ get_config_dir,
52
+ get_cache_dir,
53
+ get_state_dir,
54
+ resolve_config_path,
55
+ resolve_model_dir,
56
+ )
57
+
58
# Names re-exported as the package's public API, grouped by development phase.
__all__ = [
    # Phase 1: core modules
    "RewardSignal", "RewardCalculator",
    "InteractionEnvironment", "State", "Action",
    "AlignmentAgent", "PolicyNetwork", "ValueNetwork",
    "RLLearner", "RLAlignmentEngine",

    # Phase 2: optimization modules
    "MLPModel", "PolicyNetworkPyTorch", "ValueNetworkPyTorch",
    "create_policy_network", "create_value_network", "TORCH_AVAILABLE",
    "Experience", "ExperienceReplay",
    "RLTrainer",

    # Phase 3: advanced feature modules
    "DistributedTrainer", "DistributedTrainingConfig",
    "LearningRateScheduler", "HyperparameterSearch",
    "EarlyStopping", "HyperparameterTuner",
    "TrainingMonitor", "MetricsAnalyzer", "TENSORBOARD_AVAILABLE",
    "BatchInference", "ModelQuantization", "InferenceCache",
    "JITOptimizer", "PerformanceOptimizer",
    "get_config_dir", "get_cache_dir", "get_state_dir",
    "resolve_config_path", "resolve_model_dir",
]

# Package version string.
__version__ = "1.0.0"
lib/agent.py ADDED
@@ -0,0 +1,594 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ 强化学习智能体 - Actor-Critic实现
4
+
5
+ 实现Actor-Critic算法,包括:
6
+ - PolicyNetwork: 策略网络(输出动作概率分布)
7
+ - ValueNetwork: 价值网络(估计状态价值)
8
+ - AlignmentAgent: Actor-Critic智能体
9
+
10
+ Phase 1: 纯NumPy实现(线性模型)
11
+ Phase 2: 可选PyTorch实现(神经网络)
12
+ """
13
+
14
+ import numpy as np
15
+ from typing import Dict, List, Any, Tuple, Optional
16
+ from dataclasses import dataclass
17
+ import json
18
+ from pathlib import Path
19
+
20
+ from .contracts import (
21
+ ACTION_HEAD_DIMS,
22
+ ACTION_VECTOR_DIM,
23
+ AGENT_ORDER,
24
+ AUTOMATION_ORDER,
25
+ CONFIRM_ORDER,
26
+ STYLE_ORDER,
27
+ )
28
+ from .environment import State, Action, AgentType, AutomationLevel, CommunicationStyle
29
+
30
+
31
@dataclass
class Trajectory:
    """Container for the transitions collected during one episode.

    The five lists are indexed in parallel: entry ``i`` of each list
    belongs to the same step.
    """

    states: List[np.ndarray]       # state vector for each step
    actions: List[np.ndarray]      # action index vector for each step
    rewards: List[float]           # reward received at each step
    dones: List[bool]              # termination flag for each step
    next_states: List[np.ndarray]  # state vector observed after each step

    def __len__(self) -> int:
        """Return the number of recorded steps (the length of ``states``)."""
        step_count = len(self.states)
        return step_count

    def __repr__(self) -> str:
        """Return a short summary showing the length and the summed reward."""
        reward_sum = sum(self.rewards)
        return f"Trajectory(length={len(self)}, total_reward={reward_sum:.2f})"
45
+
46
+
47
class PolicyNetwork:
    """Multi-head linear policy that outputs action probability distributions.

    Phase 1 implementation: for every head the logits are
    ``state @ weights[head] + bias[head]`` and a softmax turns them into
    probabilities. ``hidden_dim`` is stored for the optional Phase 2 neural
    network but is not used by this linear model.
    """

    # Position of each head inside an action index vector.
    _HEAD_ORDER = ("agent", "automation", "style", "confirm")

    def __init__(self, state_dim: int, action_dim: int = ACTION_VECTOR_DIM, hidden_dim: int = 64):
        """Create randomly initialised linear parameters for every head.

        Args:
            state_dim: Length of the state vector.
            action_dim: Action dimension (stored; the per-head sizes come
                from ``ACTION_HEAD_DIMS``).
            hidden_dim: Hidden layer size, kept for the Phase 2 model.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        # Copy so later changes never leak into the shared contract constant.
        self.head_dims = ACTION_HEAD_DIMS.copy()

        # One (state_dim, head_size) weight matrix and one bias vector per head.
        self.weights = {
            name: np.random.randn(state_dim, dim) * 0.01
            for name, dim in self.head_dims.items()
        }
        self.bias = {
            name: np.zeros(dim)
            for name, dim in self.head_dims.items()
        }

    def forward(self, state: np.ndarray) -> Dict[str, np.ndarray]:
        """Return the logits of every head for ``state``."""
        return {
            name: state @ self.weights[name] + self.bias[name]
            for name in self.head_dims
        }

    def softmax(self, logits: np.ndarray) -> np.ndarray:
        """Return the softmax of ``logits`` (normalised over all entries)."""
        # Subtract the maximum first for numerical stability.
        exp_logits = np.exp(logits - np.max(logits))
        return exp_logits / np.sum(exp_logits)

    def get_action_probs(self, state: np.ndarray) -> Dict[str, np.ndarray]:
        """Return one probability distribution per head for ``state``."""
        logits = self.forward(state)
        return {name: self.softmax(head_logits) for name, head_logits in logits.items()}

    def sample_action(self, state: np.ndarray, explore: bool = True,
                      epsilon: float = 0.1) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
        """Choose one index per head.

        Args:
            state: Current state vector.
            explore: When False, pick the most probable index of every head
                (greedy, so recommendations do not jitter at inference time).
                When True, use epsilon-greedy sampling.
            epsilon: Probability of picking uniformly random indices while
                exploring. Defaults to the previously hard-coded 0.1.

        Returns:
            ``(action_indices, action_probs)`` where ``action_indices`` is an
            int array ordered agent, automation, style, confirm.
        """
        action_probs = self.get_action_probs(state)

        if not explore:
            picks = [int(np.argmax(action_probs[name])) for name in self._HEAD_ORDER]
        elif np.random.random() < epsilon:
            # Uniform exploration, drawn independently for every head.
            picks = [np.random.randint(self.head_dims[name]) for name in self._HEAD_ORDER]
        else:
            # Sample every head independently from its own distribution.
            picks = [
                np.random.choice(self.head_dims[name], p=action_probs[name])
                for name in self._HEAD_ORDER
            ]

        return np.array(picks, dtype=int), action_probs

    def update(self, state: np.ndarray, action_indices: np.ndarray, advantage: float,
               learning_rate: float = 0.01) -> float:
        """Apply one policy-gradient (REINFORCE-style) step to every head.

        Args:
            state: State vector the action was taken in.
            action_indices: Index chosen for each head, ordered agent,
                automation, style, confirm.
            advantage: Advantage estimate used to scale the gradient.
            learning_rate: Step size.

        Returns:
            Sum over heads of ``-log(pi(a|s)) * advantage``, computed from the
            probabilities before this update.
        """
        action_probs = self.get_action_probs(state)
        total_loss = 0.0

        for head_idx, head_name in enumerate(self._HEAD_ORDER):
            probs = action_probs[head_name]
            action_idx = int(action_indices[head_idx])

            # Small constant guards against log(0).
            total_loss += -np.log(probs[action_idx] + 1e-10) * advantage

            # d log pi(a|s) / d logits = one_hot(a) - probs
            one_hot = np.zeros_like(probs)
            one_hot[action_idx] = 1.0
            grad_logits = (one_hot - probs) * advantage

            # Gradient ascent on the objective (equivalent to descending the loss).
            self.weights[head_name] += learning_rate * np.outer(state, grad_logits)
            self.bias[head_name] += learning_rate * grad_logits

        return float(total_loss)

    def save(self, path: str) -> None:
        """Write the parameters to ``path`` as JSON, creating parent directories."""
        params = {
            "weights": {name: w.tolist() for name, w in self.weights.items()},
            "bias": {name: b.tolist() for name, b in self.bias.items()},
            "state_dim": self.state_dim,
            "action_dim": self.action_dim,
            "head_dims": self.head_dims
        }

        target = Path(path).expanduser()
        target.parent.mkdir(parents=True, exist_ok=True)

        with open(target, 'w', encoding="utf-8") as f:
            json.dump(params, f)

    def load(self, path: str) -> None:
        """Load parameters from the JSON file at ``path``.

        A missing file is ignored on purpose, leaving the current parameters
        untouched.
        """
        source = Path(path).expanduser()

        if not source.exists():
            return

        with open(source, 'r', encoding="utf-8") as f:
            params = json.load(f)

        # Force float arrays: integer-valued JSON would otherwise produce int
        # arrays, which cannot take the in-place float updates in update().
        self.weights = {name: np.array(w, dtype=float) for name, w in params["weights"].items()}
        self.bias = {name: np.array(b, dtype=float) for name, b in params["bias"].items()}
        self.state_dim = params["state_dim"]
        self.action_dim = params["action_dim"]
        self.head_dims = params.get("head_dims", self.head_dims)
208
+
209
+
210
class ValueNetwork:
    """Linear state-value estimator ``V(s) = state @ weights + bias``.

    Phase 1 implementation. ``hidden_dim`` is stored for the optional Phase 2
    neural network but is not used by this linear model.
    """

    def __init__(self, state_dim: int, hidden_dim: int = 64):
        """Create small random weights and a zero bias.

        Args:
            state_dim: Length of the state vector.
            hidden_dim: Hidden layer size, kept for the Phase 2 model.
        """
        self.state_dim = state_dim
        self.hidden_dim = hidden_dim

        # Linear model parameters.
        self.weights = np.random.randn(state_dim) * 0.01
        self.bias = 0.0

    def forward(self, state: np.ndarray) -> float:
        """Return the estimated value of ``state`` as a plain float."""
        return float(state @ self.weights + self.bias)

    def update(self, state: np.ndarray, target_value: float,
               learning_rate: float = 0.01) -> float:
        """Take one gradient-descent step on the squared error.

        Args:
            state: State vector being evaluated.
            target_value: Value the estimate should move towards.
            learning_rate: Step size.

        Returns:
            The squared error before the update, as a plain ``float``.
        """
        # Coerce to a Python float so a NumPy scalar target (for example a
        # float32 reward) cannot leak into the bias or the returned loss.
        error = self.forward(state) - float(target_value)

        # d(error^2)/dw = 2 * error * state ; d(error^2)/db = 2 * error
        self.weights -= learning_rate * (2 * error * state)
        self.bias = float(self.bias - learning_rate * (2 * error))

        return float(error ** 2)

    def save(self, path: str) -> None:
        """Write the parameters to ``path`` as JSON, creating parent directories."""
        params = {
            "weights": self.weights.tolist(),
            "bias": float(self.bias),
            "state_dim": self.state_dim
        }

        target = Path(path).expanduser()
        target.parent.mkdir(parents=True, exist_ok=True)

        with open(target, 'w', encoding="utf-8") as f:
            json.dump(params, f)

    def load(self, path: str) -> None:
        """Load parameters from the JSON file at ``path``.

        A missing file is ignored on purpose, leaving the current parameters
        untouched.
        """
        source = Path(path).expanduser()

        if not source.exists():
            return

        with open(source, 'r', encoding="utf-8") as f:
            params = json.load(f)

        # Force floats: integer-valued JSON would otherwise give an int array
        # that rejects the in-place float update performed by update().
        self.weights = np.array(params["weights"], dtype=float)
        self.bias = float(params["bias"])
        self.state_dim = params["state_dim"]
293
+
294
+
295
class AlignmentAgent:
    """Actor-Critic agent combining a policy network and a value network.

    - Actor: the policy network, which selects actions.
    - Critic: the value network, which estimates state values.

    Per-step update performed by ``update_policy``:
        1. target = R            if the step is terminal
           target = R + gamma * V(s')   otherwise
        2. advantage = target - V(s)
        3. the actor takes a policy-gradient step scaled by the advantage
        4. the critic takes a squared-error step towards the target
    """

    def __init__(self, state_dim: int, action_dim: int,
                 gamma: float = 0.99,
                 actor_lr: float = 0.01,
                 critic_lr: float = 0.01):
        """Create the agent and its two networks.

        Args:
            state_dim: Length of the state vector.
            action_dim: Action dimension passed to the policy network.
            gamma: Discount factor.
            actor_lr: Learning rate of the policy network.
            critic_lr: Learning rate of the value network.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        # Networks.
        self.policy_net = PolicyNetwork(state_dim, action_dim)
        self.value_net = ValueNetwork(state_dim)

        # Learning rates.
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Training statistics.
        self.episode_count = 0
        self.total_steps = 0

    def select_action(self, state: "State", explore: bool = True) -> "Action":
        """Pick an action for ``state``.

        Args:
            state: Current state; converted with ``state.to_vector()``.
            explore: Forwarded to the policy network's ``sample_action``.

        Returns:
            The decoded ``Action``.
        """
        state_vector = state.to_vector()

        # The probabilities returned alongside the indices are not needed here.
        action_indices, _ = self.policy_net.sample_action(state_vector, explore)

        return self.decode_action_indices(action_indices)

    def encode_action_indices(self, action: "Action") -> np.ndarray:
        """Encode ``action`` as an int vector (agent, automation, style, confirm)."""
        agent_idx = AGENT_ORDER.index(action.agent_selection.value)
        automation_idx = AUTOMATION_ORDER.index(action.automation_level.value)
        style_idx = STYLE_ORDER.index(action.communication_style.value)
        confirm_idx = CONFIRM_ORDER.index(action.confirmation_needed)

        return np.array([agent_idx, automation_idx, style_idx, confirm_idx], dtype=int)

    def decode_action_indices(self, action_indices: np.ndarray) -> "Action":
        """Decode an index vector back into an ``Action``.

        Raises:
            ValueError: If the vector does not hold exactly four values or any
                index is outside the range of its lookup table.
        """
        indices = [int(x) for x in action_indices]
        if len(indices) != 4:
            raise ValueError(f"action_indices must contain 4 values, got {len(indices)}")
        agent_idx, automation_idx, style_idx, confirm_idx = indices

        if not (0 <= agent_idx < len(AGENT_ORDER)):
            raise ValueError(f"agent index out of range: {agent_idx}")
        if not (0 <= automation_idx < len(AUTOMATION_ORDER)):
            raise ValueError(f"automation index out of range: {automation_idx}")
        if not (0 <= style_idx < len(STYLE_ORDER)):
            raise ValueError(f"style index out of range: {style_idx}")
        if not (0 <= confirm_idx < len(CONFIRM_ORDER)):
            raise ValueError(f"confirm index out of range: {confirm_idx}")

        return Action(
            agent_selection=AgentType(AGENT_ORDER[agent_idx]),
            automation_level=AutomationLevel(AUTOMATION_ORDER[automation_idx]),
            communication_style=CommunicationStyle(STYLE_ORDER[style_idx]),
            confirmation_needed=CONFIRM_ORDER[confirm_idx]
        )

    def update_policy(self, trajectory: "Trajectory") -> Dict[str, float]:
        """Run one Actor-Critic update per step of ``trajectory``.

        Args:
            trajectory: Collected episode.

        Returns:
            An empty dict for an empty trajectory; otherwise the mean actor
            loss, mean critic loss, episode length and summed reward.
        """
        step_count = len(trajectory)
        if step_count == 0:
            return {}

        total_actor_loss = 0.0
        total_critic_loss = 0.0

        for i, state in enumerate(trajectory.states):
            action_indices = trajectory.actions[i]
            reward = trajectory.rewards[i]
            next_state = trajectory.next_states[i]

            # One-step bootstrapped target; terminal steps do not bootstrap.
            if trajectory.dones[i]:
                target_value = reward
            else:
                target_value = reward + self.gamma * self.value_net.forward(next_state)

            # Advantage uses the critic's estimate from before its own update.
            advantage = target_value - self.value_net.forward(state)

            total_actor_loss += self.policy_net.update(
                state, action_indices, advantage, self.actor_lr)
            total_critic_loss += self.value_net.update(
                state, target_value, self.critic_lr)

        # Statistics.
        self.episode_count += 1
        self.total_steps += step_count

        return {
            "actor_loss": total_actor_loss / step_count,
            "critic_loss": total_critic_loss / step_count,
            "episode_length": step_count,
            "total_return": sum(trajectory.rewards)
        }

    def _compute_returns(self, rewards: List[float], dones: List[bool]) -> List[float]:
        """Compute the discounted return of every step.

        Args:
            rewards: Reward of each step.
            dones: Termination flag of each step; a True flag restarts the
                running return at that step's reward.

        Returns:
            Discounted returns in the same order as ``rewards``.
        """
        returns = []
        running_return = 0.0

        # Walk backwards; append and reverse once at the end, which avoids the
        # quadratic cost of inserting at the front of the list on every step.
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                running_return = reward
            else:
                running_return = reward + self.gamma * running_return
            returns.append(running_return)

        returns.reverse()
        return returns

    def save_model(self, path: str) -> None:
        """Save both networks and a metadata file into the directory ``path``."""
        model_dir = Path(path).expanduser()
        model_dir.mkdir(parents=True, exist_ok=True)

        self.policy_net.save(str(model_dir / "policy_network.json"))
        self.value_net.save(str(model_dir / "value_network.json"))

        metadata = {
            "episode_count": self.episode_count,
            "total_steps": self.total_steps,
            "gamma": self.gamma,
            "actor_lr": self.actor_lr,
            "critic_lr": self.critic_lr
        }

        with open(model_dir / "metadata.json", 'w', encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    def load_model(self, path: str) -> None:
        """Load both networks and the training counters from ``path``.

        Only ``episode_count`` and ``total_steps`` are read back from the
        metadata file; the saved ``gamma`` and learning rates are not
        restored, so the values given to the constructor stay in effect.
        """
        model_dir = Path(path).expanduser()

        self.policy_net.load(str(model_dir / "policy_network.json"))
        self.value_net.load(str(model_dir / "value_network.json"))

        metadata_path = model_dir / "metadata.json"
        if metadata_path.exists():
            with open(metadata_path, 'r', encoding="utf-8") as f:
                metadata = json.load(f)

            self.episode_count = metadata.get("episode_count", 0)
            self.total_steps = metadata.get("total_steps", 0)
519
+
520
+
521
def main():
    """Smoke-test the agent: run three short simulated episodes, then save it."""
    from .environment import InteractionEnvironment

    # Build the environment and an agent sized to its state/action spaces.
    env = InteractionEnvironment()
    agent = AlignmentAgent(
        state_dim=env.get_state_space_size(),
        action_dim=env.get_action_space_size(),
    )

    print("智能体已创建")
    print(f"状态空间: {env.get_state_space_size()}")
    print(f"动作空间: {env.get_action_space_size()}")

    save_dir = "/tmp/openclaw_rl_agent"
    max_steps = 5  # upper bound on steps per episode

    for episode in range(3):
        print(f"\n=== Episode {episode + 1} ===")

        # Reset the environment with a fixed task context.
        state = env.reset({
            "task_type": "T2",
            "tech_stack": ["python"],
            "user_mood": "focused",
        })

        trajectory = Trajectory([], [], [], [], [])

        for step in range(max_steps):
            action = agent.select_action(state, explore=True)

            # Simulated task outcome; only the final step reports completion.
            task_result = {
                "duration": 200 + step * 50,
                "completed": step == max_steps - 1,
                "test_result": {"coverage": 70 + step * 5},
                "user_feedback": {"accepted": True},
                "metrics": {},
            }

            next_state, reward, done, _info = env.step(action, task_result)

            # Record the transition.
            trajectory.states.append(state.to_vector())
            trajectory.actions.append(agent.encode_action_indices(action))
            trajectory.rewards.append(reward)
            trajectory.dones.append(done)
            trajectory.next_states.append(next_state.to_vector())

            state = next_state

            print(f"  Step {step + 1}: reward={reward:.3f}, done={done}")

            if done:
                break

        # Learn from the collected episode and report the statistics.
        stats = agent.update_policy(trajectory)
        print(f"  总回报: {stats['total_return']:.3f}")
        print(f"  Actor损失: {stats['actor_loss']:.4f}")
        print(f"  Critic损失: {stats['critic_loss']:.4f}")

    # Persist the trained agent.
    agent.save_model(save_dir)
    print(f"\n模型已保存到 /tmp/openclaw_rl_agent")
591
+
592
+
593
# Script entry point: run the demo in main().
# NOTE(review): this module uses package-relative imports, so it presumably
# has to be launched as a module (python -m <package>.agent) rather than as a
# plain file — confirm.
if __name__ == "__main__":
    main()