agentic-qe 2.1.2 → 2.2.0

This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Files changed (100)
  1. package/CHANGELOG.md +123 -0
  2. package/README.md +1 -1
  3. package/dist/agents/index.d.ts.map +1 -1
  4. package/dist/agents/index.js +5 -1
  5. package/dist/agents/index.js.map +1 -1
  6. package/dist/core/di/AgentDependencies.d.ts +127 -0
  7. package/dist/core/di/AgentDependencies.d.ts.map +1 -0
  8. package/dist/core/di/AgentDependencies.js +251 -0
  9. package/dist/core/di/AgentDependencies.js.map +1 -0
  10. package/dist/core/di/DIContainer.d.ts +149 -0
  11. package/dist/core/di/DIContainer.d.ts.map +1 -0
  12. package/dist/core/di/DIContainer.js +333 -0
  13. package/dist/core/di/DIContainer.js.map +1 -0
  14. package/dist/core/di/index.d.ts +11 -0
  15. package/dist/core/di/index.d.ts.map +1 -0
  16. package/dist/core/di/index.js +22 -0
  17. package/dist/core/di/index.js.map +1 -0
  18. package/dist/core/index.d.ts +1 -0
  19. package/dist/core/index.d.ts.map +1 -1
  20. package/dist/core/index.js +11 -1
  21. package/dist/core/index.js.map +1 -1
  22. package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
  23. package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
  24. package/dist/learning/ExperienceSharingProtocol.js +538 -0
  25. package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
  26. package/dist/learning/LearningEngine.d.ts +101 -1
  27. package/dist/learning/LearningEngine.d.ts.map +1 -1
  28. package/dist/learning/LearningEngine.js +330 -3
  29. package/dist/learning/LearningEngine.js.map +1 -1
  30. package/dist/learning/QLearning.d.ts +38 -125
  31. package/dist/learning/QLearning.d.ts.map +1 -1
  32. package/dist/learning/QLearning.js +46 -267
  33. package/dist/learning/QLearning.js.map +1 -1
  34. package/dist/learning/QLearningLegacy.d.ts +154 -0
  35. package/dist/learning/QLearningLegacy.d.ts.map +1 -0
  36. package/dist/learning/QLearningLegacy.js +337 -0
  37. package/dist/learning/QLearningLegacy.js.map +1 -0
  38. package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
  39. package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
  40. package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
  41. package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
  42. package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
  43. package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
  44. package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
  45. package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
  46. package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
  47. package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
  48. package/dist/learning/algorithms/PPOLearner.js +490 -0
  49. package/dist/learning/algorithms/PPOLearner.js.map +1 -0
  50. package/dist/learning/algorithms/QLearning.d.ts +68 -0
  51. package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
  52. package/dist/learning/algorithms/QLearning.js +116 -0
  53. package/dist/learning/algorithms/QLearning.js.map +1 -0
  54. package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
  55. package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
  56. package/dist/learning/algorithms/SARSALearner.js +252 -0
  57. package/dist/learning/algorithms/SARSALearner.js.map +1 -0
  58. package/dist/learning/algorithms/index.d.ts +29 -0
  59. package/dist/learning/algorithms/index.d.ts.map +1 -0
  60. package/dist/learning/algorithms/index.js +44 -0
  61. package/dist/learning/algorithms/index.js.map +1 -0
  62. package/dist/learning/index.d.ts +3 -0
  63. package/dist/learning/index.d.ts.map +1 -1
  64. package/dist/learning/index.js +15 -1
  65. package/dist/learning/index.js.map +1 -1
  66. package/dist/learning/types.d.ts +2 -0
  67. package/dist/learning/types.d.ts.map +1 -1
  68. package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
  69. package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
  70. package/dist/memory/DistributedPatternLibrary.js +370 -0
  71. package/dist/memory/DistributedPatternLibrary.js.map +1 -0
  72. package/dist/memory/PatternQualityScorer.d.ts +169 -0
  73. package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
  74. package/dist/memory/PatternQualityScorer.js +327 -0
  75. package/dist/memory/PatternQualityScorer.js.map +1 -0
  76. package/dist/memory/PatternReplicationService.d.ts +187 -0
  77. package/dist/memory/PatternReplicationService.d.ts.map +1 -0
  78. package/dist/memory/PatternReplicationService.js +392 -0
  79. package/dist/memory/PatternReplicationService.js.map +1 -0
  80. package/dist/providers/ClaudeProvider.d.ts +98 -0
  81. package/dist/providers/ClaudeProvider.d.ts.map +1 -0
  82. package/dist/providers/ClaudeProvider.js +418 -0
  83. package/dist/providers/ClaudeProvider.js.map +1 -0
  84. package/dist/providers/ILLMProvider.d.ts +287 -0
  85. package/dist/providers/ILLMProvider.d.ts.map +1 -0
  86. package/dist/providers/ILLMProvider.js +33 -0
  87. package/dist/providers/ILLMProvider.js.map +1 -0
  88. package/dist/providers/LLMProviderFactory.d.ts +154 -0
  89. package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
  90. package/dist/providers/LLMProviderFactory.js +426 -0
  91. package/dist/providers/LLMProviderFactory.js.map +1 -0
  92. package/dist/providers/RuvllmProvider.d.ts +107 -0
  93. package/dist/providers/RuvllmProvider.d.ts.map +1 -0
  94. package/dist/providers/RuvllmProvider.js +417 -0
  95. package/dist/providers/RuvllmProvider.js.map +1 -0
  96. package/dist/providers/index.d.ts +31 -0
  97. package/dist/providers/index.d.ts.map +1 -0
  98. package/dist/providers/index.js +69 -0
  99. package/dist/providers/index.js.map +1 -0
  100. package/package.json +1 -1
@@ -0,0 +1,490 @@
+ "use strict";
+ /**
+ * PPOLearner - Proximal Policy Optimization Algorithm
+ *
+ * Implements PPO-Clip, the most widely used variant of PPO:
+ * - Clipped surrogate objective to prevent large policy updates
+ * - Generalized Advantage Estimation (GAE) for variance reduction
+ * - Value function clipping for stability
+ * - Multiple epochs over collected trajectories
+ *
+ * Key features:
+ * - Trust region optimization without KL constraint
+ * - Sample efficient with mini-batch updates
+ * - Robust to hyperparameter choices
+ * - Suitable for continuous and discrete action spaces
+ *
+ * PPO-Clip objective:
+ * L^CLIP(θ) = E[min(r(θ)Â, clip(r(θ), 1-ε, 1+ε)Â)]
+ * where r(θ) = π_θ(a|s) / π_θ_old(a|s)
+ *
+ * @module learning/algorithms/PPOLearner
+ * @version 1.0.0
+ */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.PPOLearner = void 0;
+ exports.createDefaultPPOConfig = createDefaultPPOConfig;
+ const AbstractRLLearner_1 = require("./AbstractRLLearner");
+ /**
+ * PPOLearner - Proximal Policy Optimization implementation
+ *
+ * PPO is a state-of-the-art policy gradient method that achieves
+ * strong performance while being simpler than TRPO.
+ *
+ * Usage:
+ * ```typescript
+ * const ppo = new PPOLearner({
+ * learningRate: 0.0003,
+ * discountFactor: 0.99,
+ * explorationRate: 0.0,
+ * explorationDecay: 1.0,
+ * minExplorationRate: 0.0,
+ * clipEpsilon: 0.2,
+ * ppoEpochs: 4,
+ * miniBatchSize: 64,
+ * valueLossCoefficient: 0.5,
+ * entropyCoefficient: 0.01,
+ * gaeLambda: 0.95,
+ * maxGradNorm: 0.5,
+ * clipValueLoss: true,
+ * policyLearningRate: 0.0003,
+ * valueLearningRate: 0.001,
+ * useExperienceReplay: false,
+ * replayBufferSize: 2048,
+ * batchSize: 64
+ * });
+ *
+ * // Collect trajectory
+ * ppo.collectStep(state, action, reward, nextState, done);
+ *
+ * // Train on collected trajectory
+ * ppo.trainOnTrajectory();
+ * ```
+ */
+ class PPOLearner extends AbstractRLLearner_1.AbstractRLLearner {
+ constructor(config) {
+ super(config);
+ this.ppoConfig = config;
+ this.policyTable = new Map();
+ this.valueTable = new Map();
+ this.oldPolicyTable = new Map();
+ this.trajectory = [];
+ this.defaultExploration = config.explorationRate;
+ this.logger.info('PPOLearner initialized', {
+ clipEpsilon: config.clipEpsilon,
+ epochs: config.ppoEpochs,
+ gaeLambda: config.gaeLambda,
+ entropyCoeff: config.entropyCoefficient
+ });
+ }
+ /**
+ * Select action using current policy (softmax)
+ */
+ selectAction(state, availableActions) {
+ if (availableActions.length === 0) {
+ throw new Error('No available actions to select from');
+ }
+ const stateKey = this.encodeState(state);
+ const probs = this.getActionProbabilities(stateKey, availableActions);
+ // Sample from distribution
+ const random = Math.random();
+ let cumulative = 0;
+ for (let i = 0; i < availableActions.length; i++) {
+ cumulative += probs[i];
+ if (random <= cumulative) {
+ return availableActions[i];
+ }
+ }
+ return availableActions[availableActions.length - 1];
+ }
+ /**
+ * Get action probabilities using softmax policy
+ */
+ getActionProbabilities(stateKey, availableActions) {
+ const preferences = [];
+ for (const action of availableActions) {
+ const actionKey = this.encodeAction(action);
+ const params = this.getPolicyParams(stateKey, actionKey);
+ preferences.push(params.preference);
+ }
+ // Softmax with numerical stability
+ const maxPref = Math.max(...preferences);
+ const expPrefs = preferences.map(p => Math.exp(p - maxPref));
+ const sumExp = expPrefs.reduce((sum, e) => sum + e, 0);
+ return expPrefs.map(e => e / sumExp);
+ }
+ /**
+ * Get policy parameters for state-action pair
+ */
+ getPolicyParams(stateKey, actionKey) {
+ const statePolicy = this.policyTable.get(stateKey);
+ if (!statePolicy) {
+ return { preference: 0, logProb: 0, updateCount: 0 };
+ }
+ return statePolicy.get(actionKey) ?? { preference: 0, logProb: 0, updateCount: 0 };
+ }
+ /**
+ * Get log probability of action under current policy
+ */
+ getLogProb(stateKey, actionKey, availableActions) {
+ // Get preference for target action
+ const params = this.getPolicyParams(stateKey, actionKey);
+ // If we don't know the action space, return stored log prob
+ if (!availableActions) {
+ return params.logProb;
+ }
+ // Calculate actual log probability
+ const prefs = [];
+ let targetPref = params.preference;
+ for (const action of availableActions) {
+ const ak = this.encodeAction(action);
+ const p = this.getPolicyParams(stateKey, ak);
+ prefs.push(p.preference);
+ if (ak === actionKey) {
+ targetPref = p.preference;
+ }
+ }
+ const maxPref = Math.max(...prefs, targetPref);
+ const expTarget = Math.exp(targetPref - maxPref);
+ const sumExp = prefs.reduce((sum, p) => sum + Math.exp(p - maxPref), 0);
+ return Math.log(expTarget / sumExp);
+ }
+ /**
+ * Get state value from value network
+ */
+ getStateValue(state) {
+ const stateKey = this.encodeState(state);
+ return this.valueTable.get(stateKey) ?? 0;
+ }
+ /**
+ * Collect a step in the trajectory
+ */
+ collectStep(state, action, reward, nextState, done) {
+ const stateKey = this.encodeState(state);
+ const actionKey = this.encodeAction(action);
+ const nextStateKey = this.encodeState(nextState);
+ const value = this.valueTable.get(stateKey) ?? 0;
+ const logProb = this.getLogProb(stateKey, actionKey);
+ this.trajectory.push({
+ state: stateKey,
+ action: actionKey,
+ reward,
+ nextState: nextStateKey,
+ done,
+ value,
+ logProb,
+ advantage: 0, // Computed later
+ returns: 0 // Computed later
+ });
+ }
+ /**
+ * Standard update interface - collects experience and trains when ready
+ */
+ update(experience, nextAction) {
+ this.stepCount++;
+ const { state, action, reward, nextState } = experience;
+ const done = experience.done ?? false;
+ // Collect step
+ this.collectStep(state, action, reward, nextState, done);
+ // Train when trajectory is large enough
+ if (this.trajectory.length >= this.ppoConfig.replayBufferSize) {
+ this.trainOnTrajectory();
+ }
+ }
+ /**
+ * Train on collected trajectory using PPO
+ */
+ trainOnTrajectory() {
+ if (this.trajectory.length === 0) {
+ return;
+ }
+ // Compute advantages using GAE
+ this.computeGAE();
+ // Save old policy for ratio computation
+ this.saveOldPolicy();
+ // Multiple epochs of training
+ for (let epoch = 0; epoch < this.ppoConfig.ppoEpochs; epoch++) {
+ this.trainEpoch();
+ }
+ // Clear trajectory
+ this.trajectory = [];
+ this.logger.info('PPO training complete', {
+ epochs: this.ppoConfig.ppoEpochs,
+ steps: this.stepCount
+ });
+ }
+ /**
+ * Compute Generalized Advantage Estimation (GAE)
+ *
+ * GAE: Â_t = Σ_{l=0}^∞ (γλ)^l δ_{t+l}
+ * where δ_t = r_t + γV(s_{t+1}) - V(s_t)
+ */
+ computeGAE() {
+ const gamma = this.config.discountFactor;
+ const lambda = this.ppoConfig.gaeLambda;
+ let lastGaeLam = 0;
+ const n = this.trajectory.length;
+ // Compute returns and advantages backwards
+ for (let t = n - 1; t >= 0; t--) {
+ const step = this.trajectory[t];
+ const nextValue = step.done
+ ? 0
+ : (t < n - 1 ? this.trajectory[t + 1].value : this.valueTable.get(step.nextState) ?? 0);
+ // TD error
+ const delta = step.reward + gamma * nextValue - step.value;
+ // GAE advantage
+ lastGaeLam = step.done
+ ? delta
+ : delta + gamma * lambda * lastGaeLam;
+ step.advantage = lastGaeLam;
+ step.returns = step.advantage + step.value;
+ }
+ // Normalize advantages
+ const advantages = this.trajectory.map(s => s.advantage);
+ const mean = advantages.reduce((s, a) => s + a, 0) / advantages.length;
+ const variance = advantages.reduce((s, a) => s + (a - mean) ** 2, 0) / advantages.length;
+ const std = Math.sqrt(variance) + 1e-8;
+ for (const step of this.trajectory) {
+ step.advantage = (step.advantage - mean) / std;
+ }
+ }
+ /**
+ * Save current policy as old policy for ratio computation
+ */
+ saveOldPolicy() {
+ this.oldPolicyTable.clear();
+ for (const [state, actions] of this.policyTable.entries()) {
+ const actionMap = new Map();
+ for (const [action, params] of actions.entries()) {
+ actionMap.set(action, { ...params });
+ }
+ this.oldPolicyTable.set(state, actionMap);
+ }
+ }
+ /**
+ * Get old log probability for ratio computation
+ */
+ getOldLogProb(stateKey, actionKey) {
+ const statePolicy = this.oldPolicyTable.get(stateKey);
+ if (!statePolicy) {
+ return 0;
+ }
+ return statePolicy.get(actionKey)?.logProb ?? 0;
+ }
+ /**
+ * Train one epoch on the trajectory
+ */
+ trainEpoch() {
+ // Shuffle trajectory
+ const shuffled = [...this.trajectory].sort(() => Math.random() - 0.5);
+ // Mini-batch updates
+ for (let i = 0; i < shuffled.length; i += this.ppoConfig.miniBatchSize) {
+ const batch = shuffled.slice(i, i + this.ppoConfig.miniBatchSize);
+ this.trainMiniBatch(batch);
+ }
+ }
+ /**
+ * Train on a mini-batch
+ */
+ trainMiniBatch(batch) {
+ for (const step of batch) {
+ // Compute probability ratio
+ const newLogProb = this.getLogProb(step.state, step.action);
+ const oldLogProb = step.logProb; // Use stored log prob
+ const ratio = Math.exp(newLogProb - oldLogProb);
+ // Compute clipped and unclipped objectives
+ const eps = this.ppoConfig.clipEpsilon;
+ const surr1 = ratio * step.advantage;
+ const surr2 = Math.max(Math.min(ratio, 1 + eps), 1 - eps) * step.advantage;
+ // Policy loss (negative because we want to maximize)
+ const policyLoss = -Math.min(surr1, surr2);
+ // Value loss
+ const valueTarget = step.returns;
+ const currentValue = this.valueTable.get(step.state) ?? 0;
+ let valueLoss = (currentValue - valueTarget) ** 2;
+ // Clip value loss if enabled
+ if (this.ppoConfig.clipValueLoss) {
+ const clippedValue = step.value + Math.max(Math.min(currentValue - step.value, eps), -eps);
+ const clippedValueLoss = (clippedValue - valueTarget) ** 2;
+ valueLoss = Math.max(valueLoss, clippedValueLoss);
+ }
+ // Entropy bonus
+ const entropy = this.computeEntropy(step.state);
+ const entropyLoss = -this.ppoConfig.entropyCoefficient * entropy;
+ // Total loss
+ const totalLoss = policyLoss + this.ppoConfig.valueLossCoefficient * valueLoss + entropyLoss;
+ // Update policy (gradient ascent direction)
+ this.updatePolicy(step.state, step.action, step.advantage, ratio);
+ // Update value function
+ this.updateValue(step.state, valueTarget);
+ }
+ }
+ /**
+ * Update policy parameters
+ */
+ updatePolicy(stateKey, actionKey, advantage, ratio) {
+ if (!this.policyTable.has(stateKey)) {
+ this.policyTable.set(stateKey, new Map());
+ }
+ const statePolicy = this.policyTable.get(stateKey);
+ const current = statePolicy.get(actionKey) ?? { preference: 0, logProb: 0, updateCount: 0 };
+ // Clipped gradient
+ const eps = this.ppoConfig.clipEpsilon;
+ let gradient = advantage;
+ if ((ratio > 1 + eps && advantage > 0) || (ratio < 1 - eps && advantage < 0)) {
+ gradient = 0; // Clipped - no update
+ }
+ // Update preference
+ const newPreference = current.preference + this.ppoConfig.policyLearningRate * gradient;
+ const newLogProb = this.getLogProb(stateKey, actionKey);
+ statePolicy.set(actionKey, {
+ preference: newPreference,
+ logProb: newLogProb,
+ updateCount: current.updateCount + 1
+ });
+ // Update Q-table for compatibility
+ this.setQValue(stateKey, actionKey, newPreference);
+ }
+ /**
+ * Update value function
+ */
+ updateValue(stateKey, target) {
+ const current = this.valueTable.get(stateKey) ?? 0;
+ const newValue = current + this.ppoConfig.valueLearningRate * (target - current);
+ this.valueTable.set(stateKey, newValue);
+ }
+ /**
+ * Compute entropy of policy at state
+ */
+ computeEntropy(stateKey) {
+ const statePolicy = this.policyTable.get(stateKey);
+ if (!statePolicy || statePolicy.size === 0) {
+ return 0;
+ }
+ const prefs = Array.from(statePolicy.values()).map(p => p.preference);
+ const maxPref = Math.max(...prefs);
+ const expPrefs = prefs.map(p => Math.exp(p - maxPref));
+ const sumExp = expPrefs.reduce((s, e) => s + e, 0);
+ const probs = expPrefs.map(e => e / sumExp);
+ let entropy = 0;
+ for (const p of probs) {
+ if (p > 0) {
+ entropy -= p * Math.log(p);
+ }
+ }
+ return entropy;
+ }
+ /**
+ * Get default exploration rate for reset
+ */
+ getDefaultExplorationRate() {
+ return this.defaultExploration;
+ }
+ /**
+ * Get PPO-specific statistics
+ */
+ getPPOStatistics() {
+ let totalValue = 0;
+ for (const v of this.valueTable.values()) {
+ totalValue += v;
+ }
+ let policySize = 0;
+ for (const statePolicy of this.policyTable.values()) {
+ policySize += statePolicy.size;
+ }
+ const avgAdvantage = this.trajectory.length > 0
+ ? this.trajectory.reduce((s, t) => s + t.advantage, 0) / this.trajectory.length
+ : 0;
+ return {
+ trajectoryLength: this.trajectory.length,
+ valueTableSize: this.valueTable.size,
+ policyTableSize: policySize,
+ avgValue: this.valueTable.size > 0 ? totalValue / this.valueTable.size : 0,
+ avgAdvantage,
+ clipFraction: 0 // Would need tracking during training
+ };
+ }
+ /**
+ * Reset PPO-specific state
+ */
+ reset() {
+ super.reset();
+ this.policyTable.clear();
+ this.valueTable.clear();
+ this.oldPolicyTable.clear();
+ this.trajectory = [];
+ this.logger.info('PPOLearner reset');
+ }
+ /**
+ * Export PPO state
+ */
+ exportPPO() {
+ const serializedPolicy = {};
+ for (const [state, actions] of this.policyTable.entries()) {
+ serializedPolicy[state] = {};
+ for (const [action, params] of actions.entries()) {
+ serializedPolicy[state][action] = params;
+ }
+ }
+ const serializedValue = {};
+ for (const [state, value] of this.valueTable.entries()) {
+ serializedValue[state] = value;
+ }
+ return {
+ base: this.export(),
+ policyTable: serializedPolicy,
+ valueTable: serializedValue,
+ ppoConfig: { ...this.ppoConfig }
+ };
+ }
+ /**
+ * Import PPO state
+ */
+ importPPO(state) {
+ this.import(state.base);
+ this.policyTable.clear();
+ for (const [stateKey, actions] of Object.entries(state.policyTable)) {
+ const actionMap = new Map();
+ for (const [actionKey, params] of Object.entries(actions)) {
+ actionMap.set(actionKey, params);
+ }
+ this.policyTable.set(stateKey, actionMap);
+ }
+ this.valueTable.clear();
+ for (const [stateKey, value] of Object.entries(state.valueTable)) {
+ this.valueTable.set(stateKey, value);
+ }
+ this.ppoConfig = { ...state.ppoConfig };
+ this.logger.info('Imported PPO state', {
+ policySize: this.policyTable.size,
+ valueSize: this.valueTable.size
+ });
+ }
+ }
+ exports.PPOLearner = PPOLearner;
+ /**
+ * Create default PPO configuration
+ */
+ function createDefaultPPOConfig() {
+ return {
+ learningRate: 0.0003,
+ discountFactor: 0.99,
+ explorationRate: 0.0, // PPO uses entropy for exploration
+ explorationDecay: 1.0,
+ minExplorationRate: 0.0,
+ clipEpsilon: 0.2,
+ ppoEpochs: 4,
+ miniBatchSize: 64,
+ valueLossCoefficient: 0.5,
+ entropyCoefficient: 0.01,
+ gaeLambda: 0.95,
+ maxGradNorm: 0.5,
+ clipValueLoss: true,
+ policyLearningRate: 0.0003,
+ valueLearningRate: 0.001,
+ useExperienceReplay: false, // PPO doesn't use replay buffer
+ replayBufferSize: 2048, // Used as trajectory buffer size
+ batchSize: 64
+ };
+ }
+ //# sourceMappingURL=PPOLearner.js.map
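The JSDoc usage block above documents the intended call pattern for the new PPOLearner. As a rough orientation, here is a minimal sketch of that loop, assuming the compiled module can be imported from its dist path and that plain objects are acceptable as states, actions, and experiences; none of those assumptions are confirmed by this diff, and the state shapes and reward signal below are made up.

```typescript
// Sketch only. Import path, state/action shapes, and reward are illustrative assumptions.
import { PPOLearner, createDefaultPPOConfig } from 'agentic-qe/dist/learning/algorithms/PPOLearner';

const ppo = new PPOLearner(createDefaultPPOConfig());

for (let step = 0; step < 2048; step++) {
  const state = { task: 'unit-test', complexity: step % 5 };            // hypothetical state
  const actions = [{ type: 'generate' }, { type: 'skip' }];             // hypothetical actions
  const action = ppo.selectAction(state, actions);                      // softmax policy sample
  const reward = Math.random();                                         // stand-in reward signal
  const nextState = { task: 'unit-test', complexity: (step + 1) % 5 };
  const done = (step + 1) % 256 === 0;

  // update() buffers the transition and calls trainOnTrajectory() once the
  // trajectory reaches replayBufferSize (2048 in createDefaultPPOConfig()).
  ppo.update({ state, action, reward, nextState, done });
}

console.log(ppo.getPPOStatistics());
```

Unlike the Q-learning-style learners added alongside it, PPOLearner does not apply an update per step: update() only accumulates the trajectory, and training (GAE, old-policy snapshot, then ppoEpochs passes of mini-batches) happens inside trainOnTrajectory().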
@@ -0,0 +1 @@
+ {"version":3,"file":"PPOLearner.js","sourceRoot":"","sources":["../../../src/learning/algorithms/PPOLearner.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;;;AA2lBH,wDAqBC;AA9mBD,2DAA0E;AAqD1E;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,MAAa,UAAW,SAAQ,qCAAiB;IAQ/C,YAAY,MAAiB;QAC3B,KAAK,CAAC,MAAM,CAAC,CAAC;QACd,IAAI,CAAC,SAAS,GAAG,MAAM,CAAC;QACxB,IAAI,CAAC,WAAW,GAAG,IAAI,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC,UAAU,GAAG,IAAI,GAAG,EAAE,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,IAAI,GAAG,EAAE,CAAC;QAChC,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QACrB,IAAI,CAAC,kBAAkB,GAAG,MAAM,CAAC,eAAe,CAAC;QAEjD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,wBAAwB,EAAE;YACzC,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,MAAM,EAAE,MAAM,CAAC,SAAS;YACxB,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,YAAY,EAAE,MAAM,CAAC,kBAAkB;SACxC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACM,YAAY,CAAC,KAAgB,EAAE,gBAA+B;QACrE,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACzD,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,IAAI,CAAC,sBAAsB,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;QAEtE,2BAA2B;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC7B,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,UAAU,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YACvB,IAAI,MAAM,IAAI,UAAU,EAAE,CAAC;gBACzB,OAAO,gBAAgB,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACvD,CAAC;IAED;;OAEG;IACK,sBAAsB,CAAC,QAAgB,EAAE,gBAA+B;QAC9E,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACtC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YACzD,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QACtC,CAAC;QAED,mCAAmC;QACnC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,CAAC;QACzC,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;QAC7D,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAEvD,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IACvC,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,QAAgB,EAAE,SAAiB;QACzD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC;QACvD,CAAC;QACD,OAAO,WAAW,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC;IACrF,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,QAAgB,EAAE,SAAiB,EAAE,gBAAgC;QACtF,mCAAmC;QACnC,MAAM,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAEzD,4DAA4D;QAC5D,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,OAAO,MAAM,CAAC,OAAO,CAAC;QACxB,CAAC;QAED,mCAAmC;QACnC,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC;QAEnC,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACtC,MAAM,EAAE,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACrC,MAAM,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YAC7C,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;YACzB,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,EAAE,UAAU,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,GAAG,OAAO,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;QAExE,OAAO,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;IACtC,CAAC;IAED;;OAEG;IACM,aAAa,CAAC,KAAgB;QACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,OAAO,IAAI,CAAC,UA
AU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,WAAW,CACT,KAAgB,EAChB,MAAmB,EACnB,MAAc,EACd,SAAoB,EACpB,IAAa;QAEb,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAC5C,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;QAEjD,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAErD,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;YACnB,KAAK,EAAE,QAAQ;YACf,MAAM,EAAE,SAAS;YACjB,MAAM;YACN,SAAS,EAAE,YAAY;YACvB,IAAI;YACJ,KAAK;YACL,OAAO;YACP,SAAS,EAAE,CAAC,EAAE,iBAAiB;YAC/B,OAAO,EAAE,CAAC,CAAI,iBAAiB;SAChC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACM,MAAM,CAAC,UAA0B,EAAE,UAAwB;QAClE,IAAI,CAAC,SAAS,EAAE,CAAC;QAEjB,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,UAAU,CAAC;QACxD,MAAM,IAAI,GAAG,UAAU,CAAC,IAAI,IAAI,KAAK,CAAC;QAEtC,eAAe;QACf,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAEzD,wCAAwC;QACxC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,IAAI,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,CAAC;YAC9D,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO;QACT,CAAC;QAED,+BAA+B;QAC/B,IAAI,CAAC,UAAU,EAAE,CAAC;QAElB,wCAAwC;QACxC,IAAI,CAAC,aAAa,EAAE,CAAC;QAErB,8BAA8B;QAC9B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,KAAK,EAAE,EAAE,CAAC;YAC9D,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,mBAAmB;QACnB,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QAErB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,uBAAuB,EAAE;YACxC,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS;YAChC,KAAK,EAAE,IAAI,CAAC,SAAS;SACtB,CAAC,CAAC;IACL,CAAC;IAED;;;;;OAKG;IACK,UAAU;QAChB,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC;QACzC,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC;QAExC,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,MAAM,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;QAEjC,2CAA2C;QAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAEhC,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI;gBACzB,CAAC,CAAC,CAAC;gBACH,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;YAE1F,WAAW;YACX,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,GAAG,KAAK,GAAG,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC;YAE3D,gBAAgB;YAChB,UAAU,GAAG,IAAI,CAAC,IAAI;gBACpB,CAAC,CAAC,KAAK;gBACP,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,MAAM,GAAG,UAAU,CAAC;YAExC,IAAI,CAAC,SAAS,GAAG,UAAU,CAAC;YAC5B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC;QAC7C,CAAC;QAED,uBAAuB;QACvB,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACzD,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QACvE,MAAM,QAAQ,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QACzF,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACnC,IAAI,CAAC,SAAS,GAAG,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,GAAG,CAAC;QACjD,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,CAAC;QAC5B,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,CAAC;YAC1D,MAAM,SAAS,GAAG,IAAI,GAAG,EAAwB,CAAC;YAClD,KAAK,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;gBACjD,SAAS,CAAC,GAAG,CAAC,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,CAAC,CAAC;YACvC,CAAC;YACD,IAAI,CAAC,cAAc,C
AAC,GAAG,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAgB,EAAE,SAAiB;QACvD,MAAM,WAAW,GAAG,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACtD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO,CAAC,CAAC;QACX,CAAC;QACD,OAAO,WAAW,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,OAAO,IAAI,CAAC,CAAC;IAClD,CAAC;IAED;;OAEG;IACK,UAAU;QAChB,qBAAqB;QACrB,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC;QAEtE,qBAAqB;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,CAAC;YACvE,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;YAClE,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAC7B,CAAC;IACH,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,KAAuB;QAC5C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,4BAA4B;YAC5B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YAC5D,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC,sBAAsB;YACvD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC;YAEhD,2CAA2C;YAC3C,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;YACvC,MAAM,KAAK,GAAG,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC;YACrC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC;YAE3E,qDAAqD;YACrD,MAAM,UAAU,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAE3C,aAAa;YACb,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC;YACjC,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAC1D,IAAI,SAAS,GAAG,CAAC,YAAY,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;YAElD,6BAA6B;YAC7B,IAAI,IAAI,CAAC,SAAS,CAAC,aAAa,EAAE,CAAC;gBACjC,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,GAAG,IAAI,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC;gBAC3F,MAAM,gBAAgB,GAAG,CAAC,YAAY,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;gBAC3D,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,gBAAgB,CAAC,CAAC;YACpD,CAAC;YAED,gBAAgB;YAChB,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAChD,MAAM,WAAW,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,kBAAkB,GAAG,OAAO,CAAC;YAEjE,aAAa;YACb,MAAM,SAAS,GAAG,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,oBAAoB,GAAG,SAAS,GAAG,WAAW,CAAC;YAE7F,4CAA4C;YAC5C,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YAElE,wBAAwB;YACxB,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED;;OAEG;IACK,YAAY,CAClB,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,KAAa;QAEb,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACpC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;QACD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAE,CAAC;QAEpD,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC;QAE5F,mBAAmB;QACnB,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;QACvC,IAAI,QAAQ,GAAG,SAAS,CAAC;QACzB,IAAI,CAAC,KAAK,GAAG,CAAC,GAAG,GAAG,IAAI,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,GAAG,GAAG,IAAI,SAAS,GAAG,CAAC,CAAC,EAAE,CAAC;YAC7E,QAAQ,GAAG,CAAC,CAAC,CAAC,sBAAsB;QACtC,CAAC;QAED,oBAAoB;QACpB,MAAM,aAAa,GAAG,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,kBAAkB,GAAG,QAAQ,CAAC;QACxF,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAExD,WAAW,CAAC,GAAG,CAAC,SAAS,EAAE;YACzB,UAAU,EAAE,aAAa;YACzB,OAAO,EAAE,UAAU;YACnB,WAAW,EAAE,OAAO,CAAC,WAAW,GAAG,CAAC;SACrC,CAAC,CAAC;QAEH,mCAAmC;QACnC,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;IACrD,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,QAAgB,EAAE,MAAc;QAClD,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACnD
,MAAM,QAAQ,GAAG,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,iBAAiB,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC;QACjF,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,QAAgB;QACrC,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO,CAAC,CAAC;QACX,CAAC;QAED,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;QACtE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QACnD,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;QAE5C,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACV,OAAO,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACO,yBAAyB;QACjC,OAAO,IAAI,CAAC,kBAAkB,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,gBAAgB;QAQd,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,EAAE,CAAC;YACzC,UAAU,IAAI,CAAC,CAAC;QAClB,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,WAAW,IAAI,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC;YACpD,UAAU,IAAI,WAAW,CAAC,IAAI,CAAC;QACjC,CAAC;QAED,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;YAC7C,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM;YAC/E,CAAC,CAAC,CAAC,CAAC;QAEN,OAAO;YACL,gBAAgB,EAAE,IAAI,CAAC,UAAU,CAAC,MAAM;YACxC,cAAc,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI;YACpC,eAAe,EAAE,UAAU;YAC3B,QAAQ,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1E,YAAY;YACZ,YAAY,EAAE,CAAC,CAAC,sCAAsC;SACvD,CAAC;IACJ,CAAC;IAED;;OAEG;IACM,KAAK;QACZ,KAAK,CAAC,KAAK,EAAE,CAAC;QACd,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,CAAC;QAC5B,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC;QACrB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,SAAS;QAMP,MAAM,gBAAgB,GAAiD,EAAE,CAAC;QAC1E,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,CAAC;YAC1D,gBAAgB,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;YAC7B,KAAK,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;gBACjD,gBAAgB,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC;YAC3C,CAAC;QACH,CAAC;QAED,MAAM,eAAe,GAA2B,EAAE,CAAC;QACnD,KAAK,MAAM,CAAC,KAAK,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC;YACvD,eAAe,CAAC,KAAK,CAAC,GAAG,KAAK,CAAC;QACjC,CAAC;QAED,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,MAAM,EAAE;YACnB,WAAW,EAAE,gBAAgB;YAC7B,UAAU,EAAE,eAAe;YAC3B,SAAS,EAAE,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE;SACjC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,KAAwC;QAChD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAExB,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;QACzB,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,EAAE,CAAC;YACpE,MAAM,SAAS,GAAG,IAAI,GAAG,EAAwB,CAAC;YAClD,KAAK,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1D,SAAS,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;YACnC,CAAC;YACD,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAC5C,CAAC;QAED,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,EAAE,CAAC;YACjE,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QA
AQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;QAED,IAAI,CAAC,SAAS,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;QAExC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,oBAAoB,EAAE;YACrC,UAAU,EAAE,IAAI,CAAC,WAAW,CAAC,IAAI;YACjC,SAAS,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI;SAChC,CAAC,CAAC;IACL,CAAC;CACF;AA3fD,gCA2fC;AAED;;GAEG;AACH,SAAgB,sBAAsB;IACpC,OAAO;QACL,YAAY,EAAE,MAAM;QACpB,cAAc,EAAE,IAAI;QACpB,eAAe,EAAE,GAAG,EAAE,mCAAmC;QACzD,gBAAgB,EAAE,GAAG;QACrB,kBAAkB,EAAE,GAAG;QACvB,WAAW,EAAE,GAAG;QAChB,SAAS,EAAE,CAAC;QACZ,aAAa,EAAE,EAAE;QACjB,oBAAoB,EAAE,GAAG;QACzB,kBAAkB,EAAE,IAAI;QACxB,SAAS,EAAE,IAAI;QACf,WAAW,EAAE,GAAG;QAChB,aAAa,EAAE,IAAI;QACnB,kBAAkB,EAAE,MAAM;QAC1B,iBAAiB,EAAE,KAAK;QACxB,mBAAmB,EAAE,KAAK,EAAE,gCAAgC;QAC5D,gBAAgB,EAAE,IAAI,EAAM,iCAAiC;QAC7D,SAAS,EAAE,EAAE;KACd,CAAC;AACJ,CAAC"}
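Since computeGAE above is the core of the PPO training path, here is a standalone sketch of the same backward recursion (δ_t = r_t + γ·V(s_{t+1}) − V(s_t), accumulated with factor γλ), detached from the package's classes. The rewards and value estimates are made-up numbers; γ and λ match the defaults in createDefaultPPOConfig.

```typescript
// Standalone illustration of the GAE backward pass used by computeGAE above.
// The last step is treated as terminal (done = true), so its bootstrap value is 0.
const gamma = 0.99;
const lambda = 0.95;
const rewards = [1, 0, 1];           // r_0..r_2 (hypothetical)
const values = [0.5, 0.6, 0.4];      // V(s_0)..V(s_2) (hypothetical)
const nextValues = [0.6, 0.4, 0];    // V(s_1), V(s_2), terminal state -> 0

let lastGaeLam = 0;
const advantages = new Array<number>(rewards.length);
for (let t = rewards.length - 1; t >= 0; t--) {
  const delta = rewards[t] + gamma * nextValues[t] - values[t];   // TD error δ_t
  lastGaeLam = t === rewards.length - 1 ? delta : delta + gamma * lambda * lastGaeLam;
  advantages[t] = lastGaeLam;
}
const returns = advantages.map((a, t) => a + values[t]);          // targets for the value table
console.log({ advantages, returns });
```

computeGAE then normalizes the advantages to zero mean and unit variance before the epoch loop, which keeps the clipped policy updates on a comparable scale across trajectories.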
@@ -0,0 +1,68 @@
+ /**
+ * QLearning - Off-policy TD(0) Reinforcement Learning
+ *
+ * Implements standard Q-learning algorithm for reinforcement learning.
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
+ */
+ import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+ import { TaskExperience, AgentAction } from '../types';
+ /**
+ * Q-learning configuration (extends base RL config)
+ */
+ export interface QLearningConfig extends RLConfig {
+ }
+ /**
+ * QLearning - Standard Q-learning implementation
+ *
+ * Implements the classic Q-learning algorithm with:
+ * - Epsilon-greedy exploration policy
+ * - Off-policy temporal difference (TD) learning
+ * - Q-table for state-action values
+ * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
+ */
+ export declare class QLearning extends AbstractRLLearner {
+ private readonly defaultConfig;
+ constructor(config?: Partial<QLearningConfig>);
+ /**
+ * Update Q-value using Q-learning update rule
+ * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ *
+ * @param experience The transition experience (s, a, r, s')
+ * @param nextAction Ignored in Q-learning (uses max Q-value instead)
+ */
+ update(experience: TaskExperience, nextAction?: AgentAction): void;
+ /**
+ * Get the default exploration rate for this algorithm
+ */
+ protected getDefaultExplorationRate(): number;
+ /**
+ * Get algorithm name
+ */
+ getAlgorithmName(): string;
+ /**
+ * Get algorithm type (off-policy)
+ */
+ getAlgorithmType(): 'on-policy' | 'off-policy';
+ /**
+ * Get detailed statistics including Q-learning-specific metrics
+ */
+ getDetailedStatistics(): {
+ algorithm: string;
+ type: 'on-policy' | 'off-policy';
+ stats: ReturnType<AbstractRLLearner['getStatistics']>;
+ };
+ }
+ //# sourceMappingURL=QLearning.d.ts.map
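The declaration above only exposes the interface; the update rule it documents, Q(s,a) ← Q(s,a) + α[r + γ·max_a' Q(s',a') − Q(s,a)], is easy to see in isolation. The sketch below is not the package's implementation: the string keys, hyperparameters, and action list are made-up stand-ins for whatever AbstractRLLearner's state/action encoding actually produces.

```typescript
// Minimal tabular illustration of the documented off-policy update rule.
const qTable = new Map<string, number>();   // key: `${state}|${action}`
const alpha = 0.1;                          // learning rate
const gamma = 0.99;                         // discount factor

function qLearningUpdate(state: string, action: string, reward: number,
                         nextState: string, nextActions: string[]): void {
  const key = `${state}|${action}`;
  const current = qTable.get(key) ?? 0;
  // Off-policy: bootstrap from the best available next action,
  // not from the action the exploration policy will actually take (that would be SARSA).
  const maxNext = Math.max(0, ...nextActions.map(a => qTable.get(`${nextState}|${a}`) ?? 0));
  qTable.set(key, current + alpha * (reward + gamma * maxNext - current));
}

qLearningUpdate('s0', 'generate-test', 1.0, 's1', ['generate-test', 'skip']);
console.log(qTable.get('s0|generate-test')); // 0.1 after a single update from an empty table
```

This is also why the declared update(experience, nextAction?) can ignore nextAction, as the @param note says: the target uses the greedy max over next-state actions rather than the action actually selected.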
@@ -0,0 +1 @@
+ {"version":3,"file":"QLearning.d.ts","sourceRoot":"","sources":["../../../src/learning/algorithms/QLearning.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAClE,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAEvD;;GAEG;AACH,MAAM,WAAW,eAAgB,SAAQ,QAAQ;CAGhD;AAgBD;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,SAAU,SAAQ,iBAAiB;IAC9C,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAkB;gBAEpC,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAOjD;;;;;;OAMG;IACH,MAAM,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,EAAE,WAAW,GAAG,IAAI;IAiClE;;OAEG;IACH,SAAS,CAAC,yBAAyB,IAAI,MAAM;IAI7C;;OAEG;IACH,gBAAgB,IAAI,MAAM;IAI1B;;OAEG;IACH,gBAAgB,IAAI,WAAW,GAAG,YAAY;IAI9C;;OAEG;IACH,qBAAqB,IAAI;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,IAAI,EAAE,WAAW,GAAG,YAAY,CAAC;QACjC,KAAK,EAAE,UAAU,CAAC,iBAAiB,CAAC,eAAe,CAAC,CAAC,CAAC;KACvD;CAOF"}