agentic-qe 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/CHANGELOG.md +123 -0
  2. package/README.md +1 -1
  3. package/dist/agents/index.d.ts.map +1 -1
  4. package/dist/agents/index.js +5 -1
  5. package/dist/agents/index.js.map +1 -1
  6. package/dist/core/di/AgentDependencies.d.ts +127 -0
  7. package/dist/core/di/AgentDependencies.d.ts.map +1 -0
  8. package/dist/core/di/AgentDependencies.js +251 -0
  9. package/dist/core/di/AgentDependencies.js.map +1 -0
  10. package/dist/core/di/DIContainer.d.ts +149 -0
  11. package/dist/core/di/DIContainer.d.ts.map +1 -0
  12. package/dist/core/di/DIContainer.js +333 -0
  13. package/dist/core/di/DIContainer.js.map +1 -0
  14. package/dist/core/di/index.d.ts +11 -0
  15. package/dist/core/di/index.d.ts.map +1 -0
  16. package/dist/core/di/index.js +22 -0
  17. package/dist/core/di/index.js.map +1 -0
  18. package/dist/core/index.d.ts +1 -0
  19. package/dist/core/index.d.ts.map +1 -1
  20. package/dist/core/index.js +11 -1
  21. package/dist/core/index.js.map +1 -1
  22. package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
  23. package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
  24. package/dist/learning/ExperienceSharingProtocol.js +538 -0
  25. package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
  26. package/dist/learning/LearningEngine.d.ts +101 -1
  27. package/dist/learning/LearningEngine.d.ts.map +1 -1
  28. package/dist/learning/LearningEngine.js +330 -3
  29. package/dist/learning/LearningEngine.js.map +1 -1
  30. package/dist/learning/QLearning.d.ts +38 -125
  31. package/dist/learning/QLearning.d.ts.map +1 -1
  32. package/dist/learning/QLearning.js +46 -267
  33. package/dist/learning/QLearning.js.map +1 -1
  34. package/dist/learning/QLearningLegacy.d.ts +154 -0
  35. package/dist/learning/QLearningLegacy.d.ts.map +1 -0
  36. package/dist/learning/QLearningLegacy.js +337 -0
  37. package/dist/learning/QLearningLegacy.js.map +1 -0
  38. package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
  39. package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
  40. package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
  41. package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
  42. package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
  43. package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
  44. package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
  45. package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
  46. package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
  47. package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
  48. package/dist/learning/algorithms/PPOLearner.js +490 -0
  49. package/dist/learning/algorithms/PPOLearner.js.map +1 -0
  50. package/dist/learning/algorithms/QLearning.d.ts +68 -0
  51. package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
  52. package/dist/learning/algorithms/QLearning.js +116 -0
  53. package/dist/learning/algorithms/QLearning.js.map +1 -0
  54. package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
  55. package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
  56. package/dist/learning/algorithms/SARSALearner.js +252 -0
  57. package/dist/learning/algorithms/SARSALearner.js.map +1 -0
  58. package/dist/learning/algorithms/index.d.ts +29 -0
  59. package/dist/learning/algorithms/index.d.ts.map +1 -0
  60. package/dist/learning/algorithms/index.js +44 -0
  61. package/dist/learning/algorithms/index.js.map +1 -0
  62. package/dist/learning/index.d.ts +3 -0
  63. package/dist/learning/index.d.ts.map +1 -1
  64. package/dist/learning/index.js +15 -1
  65. package/dist/learning/index.js.map +1 -1
  66. package/dist/learning/types.d.ts +2 -0
  67. package/dist/learning/types.d.ts.map +1 -1
  68. package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
  69. package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
  70. package/dist/memory/DistributedPatternLibrary.js +370 -0
  71. package/dist/memory/DistributedPatternLibrary.js.map +1 -0
  72. package/dist/memory/PatternQualityScorer.d.ts +169 -0
  73. package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
  74. package/dist/memory/PatternQualityScorer.js +327 -0
  75. package/dist/memory/PatternQualityScorer.js.map +1 -0
  76. package/dist/memory/PatternReplicationService.d.ts +187 -0
  77. package/dist/memory/PatternReplicationService.d.ts.map +1 -0
  78. package/dist/memory/PatternReplicationService.js +392 -0
  79. package/dist/memory/PatternReplicationService.js.map +1 -0
  80. package/dist/providers/ClaudeProvider.d.ts +98 -0
  81. package/dist/providers/ClaudeProvider.d.ts.map +1 -0
  82. package/dist/providers/ClaudeProvider.js +418 -0
  83. package/dist/providers/ClaudeProvider.js.map +1 -0
  84. package/dist/providers/ILLMProvider.d.ts +287 -0
  85. package/dist/providers/ILLMProvider.d.ts.map +1 -0
  86. package/dist/providers/ILLMProvider.js +33 -0
  87. package/dist/providers/ILLMProvider.js.map +1 -0
  88. package/dist/providers/LLMProviderFactory.d.ts +154 -0
  89. package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
  90. package/dist/providers/LLMProviderFactory.js +426 -0
  91. package/dist/providers/LLMProviderFactory.js.map +1 -0
  92. package/dist/providers/RuvllmProvider.d.ts +107 -0
  93. package/dist/providers/RuvllmProvider.d.ts.map +1 -0
  94. package/dist/providers/RuvllmProvider.js +417 -0
  95. package/dist/providers/RuvllmProvider.js.map +1 -0
  96. package/dist/providers/index.d.ts +31 -0
  97. package/dist/providers/index.d.ts.map +1 -0
  98. package/dist/providers/index.js +69 -0
  99. package/dist/providers/index.js.map +1 -0
  100. package/package.json +1 -1
package/dist/learning/algorithms/ActorCriticLearner.js
@@ -0,0 +1,447 @@
+ "use strict";
+ /**
+ * ActorCriticLearner - Actor-Critic Reinforcement Learning Algorithm
+ *
+ * Implements Advantage Actor-Critic (A2C) combining:
+ * - Actor: Policy network that selects actions using softmax policy
+ * - Critic: Value network that estimates state values for advantage calculation
+ *
+ * Key features:
+ * - Continuous action probabilities via softmax
+ * - Advantage-based updates to reduce variance
+ * - Entropy bonus for exploration
+ * - Policy gradient with baseline
+ *
+ * Update rules:
+ * - Critic (Value): V(s) += α_c * δ where δ = r + γV(s') - V(s)
+ * - Actor (Policy): π(a|s) += α_a * δ * ∇log(π(a|s)) + β * H(π)
+ *
+ * @module learning/algorithms/ActorCriticLearner
+ * @version 1.0.0
+ */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.ActorCriticLearner = void 0;
+ exports.createDefaultActorCriticConfig = createDefaultActorCriticConfig;
+ const AbstractRLLearner_1 = require("./AbstractRLLearner");
+ /**
+ * ActorCriticLearner - Advantage Actor-Critic implementation
+ *
+ * Combines policy gradient (actor) with value function approximation (critic)
+ * for more stable and efficient learning than pure Q-learning.
+ *
+ * Usage:
+ * ```typescript
+ * const ac = new ActorCriticLearner({
+ * learningRate: 0.1,
+ * actorLearningRate: 0.01,
+ * criticLearningRate: 0.1,
+ * discountFactor: 0.95,
+ * explorationRate: 0.3,
+ * explorationDecay: 0.995,
+ * minExplorationRate: 0.01,
+ * entropyCoefficient: 0.01,
+ * temperature: 1.0,
+ * normalizeAdvantage: true,
+ * targetUpdateFrequency: 100,
+ * useExperienceReplay: true,
+ * replayBufferSize: 10000,
+ * batchSize: 32
+ * });
+ *
+ * const action = ac.selectAction(state, availableActions);
+ * ac.update(experience);
+ * ```
+ */
+ class ActorCriticLearner extends AbstractRLLearner_1.AbstractRLLearner {
+ constructor(config) {
+ super(config);
+ this.actorConfig = config;
+ this.policyTable = new Map();
+ this.valueTable = new Map();
+ this.targetValueTable = new Map();
+ this.updatesSinceTargetSync = 0;
+ this.advantageHistory = [];
+ this.defaultExploration = config.explorationRate;
+ this.logger.info('ActorCriticLearner initialized', {
+ actorLR: config.actorLearningRate,
+ criticLR: config.criticLearningRate,
+ entropy: config.entropyCoefficient,
+ temperature: config.temperature
+ });
+ }
+ /**
+ * Select action using softmax policy with exploration
+ * π(a|s) = exp(Q(s,a)/τ) / Σ_a' exp(Q(s,a')/τ)
+ */
+ selectAction(state, availableActions) {
+ if (availableActions.length === 0) {
+ throw new Error('No available actions to select from');
+ }
+ // With probability ε, use random action (exploration fallback)
+ if (Math.random() < this.config.explorationRate) {
+ const randomIndex = Math.floor(Math.random() * availableActions.length);
+ return availableActions[randomIndex];
+ }
+ // Use softmax policy
+ return this.sampleFromPolicy(state, availableActions);
+ }
+ /**
+ * Sample action from softmax policy distribution
+ */
+ sampleFromPolicy(state, availableActions) {
+ const stateKey = this.encodeState(state);
+ const probabilities = this.getActionProbabilities(stateKey, availableActions);
+ // Sample from categorical distribution
+ const random = Math.random();
+ let cumulative = 0;
+ for (let i = 0; i < availableActions.length; i++) {
+ cumulative += probabilities[i];
+ if (random <= cumulative) {
+ return availableActions[i];
+ }
+ }
+ // Fallback (shouldn't reach here due to normalization)
+ return availableActions[availableActions.length - 1];
+ }
+ /**
+ * Get softmax action probabilities
+ * π(a|s) = exp(preference(s,a)/τ) / Σ_a' exp(preference(s,a')/τ)
+ */
+ getActionProbabilities(stateKey, availableActions) {
+ const temperature = this.actorConfig.temperature;
+ const preferences = [];
+ // Get preferences (Q-values or policy table values)
+ for (const action of availableActions) {
+ const actionKey = this.encodeAction(action);
+ const preference = this.getPreference(stateKey, actionKey);
+ preferences.push(preference / temperature);
+ }
+ // Softmax with numerical stability
+ const maxPref = Math.max(...preferences);
+ const expPrefs = preferences.map(p => Math.exp(p - maxPref));
+ const sumExp = expPrefs.reduce((sum, e) => sum + e, 0);
+ return expPrefs.map(e => e / sumExp);
+ }
+ /**
+ * Get preference for state-action pair from policy table
+ */
+ getPreference(stateKey, actionKey) {
+ const statePolicy = this.policyTable.get(stateKey);
+ if (!statePolicy) {
+ return 0; // uniform preference initially
+ }
+ const entry = statePolicy.get(actionKey);
+ return entry ? entry.probability : 0;
+ }
+ /**
+ * Update actor and critic using temporal difference
+ *
+ * TD Error (advantage): δ = r + γV(s') - V(s)
+ * Critic update: V(s) += α_c * δ
+ * Actor update: preference(s,a) += α_a * δ * (1 - π(a|s))
+ */
+ update(experience, nextAction) {
+ this.stepCount++;
+ const { state, action, reward, nextState, done } = this.extractExperience(experience);
+ const stateKey = this.encodeState(state);
+ const actionKey = this.encodeAction(action);
+ // Get current and next state values from critic
+ const currentV = this.getStateValue(state);
+ const nextV = done ? 0 : this.getTargetStateValue(nextState);
+ // Calculate TD error (advantage)
+ let advantage = reward + this.config.discountFactor * nextV - currentV;
+ // Normalize advantage if enabled
+ if (this.actorConfig.normalizeAdvantage) {
+ advantage = this.normalizeAdvantage(advantage);
+ }
+ // Update critic (value function)
+ this.updateCritic(stateKey, currentV, advantage);
+ // Update actor (policy)
+ this.updateActor(stateKey, actionKey, advantage);
+ // Store in replay buffer if enabled
+ if (this.replayBuffer) {
+ this.replayBuffer.add(experience);
+ }
+ // Sync target network periodically
+ this.updatesSinceTargetSync++;
+ if (this.updatesSinceTargetSync >= this.actorConfig.targetUpdateFrequency) {
+ this.syncTargetNetwork();
+ this.updatesSinceTargetSync = 0;
+ }
+ this.logger.debug('Actor-Critic update', {
+ state: stateKey,
+ action: actionKey,
+ reward,
+ advantage,
+ valueUpdate: currentV + this.actorConfig.criticLearningRate * advantage
+ });
+ }
+ /**
+ * Update critic (value function)
+ * V(s) += α_c * δ
+ */
+ updateCritic(stateKey, currentV, advantage) {
+ const newValue = currentV + this.actorConfig.criticLearningRate * advantage;
+ const existingEntry = this.valueTable.get(stateKey);
+ this.valueTable.set(stateKey, {
+ state: stateKey,
+ value: newValue,
+ updateCount: (existingEntry?.updateCount ?? 0) + 1,
+ lastUpdated: Date.now()
+ });
+ }
+ /**
+ * Update actor (policy)
+ * For softmax policy: preference(s,a) += α_a * δ * (1 - π(a|s))
+ * This increases preference for actions with positive advantage
+ */
+ updateActor(stateKey, actionKey, advantage) {
+ if (!this.policyTable.has(stateKey)) {
+ this.policyTable.set(stateKey, new Map());
+ }
+ const statePolicy = this.policyTable.get(stateKey);
+ // Get current preference and probability
+ const currentEntry = statePolicy.get(actionKey);
+ const currentPref = currentEntry?.probability ?? 0;
+ // Approximate gradient: increase preference proportional to advantage
+ // Also add entropy bonus to encourage exploration
+ const entropyBonus = this.calculateEntropyBonus(stateKey);
+ const newPref = currentPref + this.actorConfig.actorLearningRate * (advantage + entropyBonus);
+ statePolicy.set(actionKey, {
+ action: actionKey,
+ probability: newPref,
+ logProbability: Math.log(Math.max(0.001, this.softmaxProb(stateKey, actionKey))),
+ updateCount: (currentEntry?.updateCount ?? 0) + 1,
+ lastUpdated: Date.now()
+ });
+ // Also update Q-table for getBestAction compatibility
+ this.setQValue(stateKey, actionKey, newPref);
+ }
+ /**
+ * Calculate entropy bonus for a state
+ * H(π(·|s)) = -Σ_a π(a|s) log(π(a|s))
+ */
+ calculateEntropyBonus(stateKey) {
+ const statePolicy = this.policyTable.get(stateKey);
+ if (!statePolicy || statePolicy.size === 0) {
+ return 0;
+ }
+ // Calculate entropy over stored actions
+ const prefs = Array.from(statePolicy.values()).map(e => e.probability);
+ const maxPref = Math.max(...prefs);
+ const expPrefs = prefs.map(p => Math.exp((p - maxPref) / this.actorConfig.temperature));
+ const sumExp = expPrefs.reduce((sum, e) => sum + e, 0);
+ const probs = expPrefs.map(e => e / sumExp);
+ let entropy = 0;
+ for (const p of probs) {
+ if (p > 0) {
+ entropy -= p * Math.log(p);
+ }
+ }
+ return this.actorConfig.entropyCoefficient * entropy;
+ }
+ /**
+ * Get softmax probability for a specific action
+ */
+ softmaxProb(stateKey, actionKey) {
+ const statePolicy = this.policyTable.get(stateKey);
+ if (!statePolicy || statePolicy.size === 0) {
+ return 1.0 / Math.max(1, statePolicy?.size ?? 1);
+ }
+ const prefs = Array.from(statePolicy.entries());
+ const temp = this.actorConfig.temperature;
+ const maxPref = Math.max(...prefs.map(([, e]) => e.probability));
+ let sumExp = 0;
+ let targetExp = 0;
+ for (const [key, entry] of prefs) {
+ const exp = Math.exp((entry.probability - maxPref) / temp);
+ sumExp += exp;
+ if (key === actionKey) {
+ targetExp = exp;
+ }
+ }
+ return targetExp / sumExp;
+ }
+ /**
+ * Normalize advantage using running statistics
+ */
+ normalizeAdvantage(advantage) {
+ this.advantageHistory.push(advantage);
+ // Keep limited history
+ if (this.advantageHistory.length > 1000) {
+ this.advantageHistory.shift();
+ }
+ if (this.advantageHistory.length < 10) {
+ return advantage;
+ }
+ const mean = this.advantageHistory.reduce((s, a) => s + a, 0) / this.advantageHistory.length;
+ const variance = this.advantageHistory.reduce((s, a) => s + (a - mean) ** 2, 0) / this.advantageHistory.length;
+ const std = Math.sqrt(variance) + 1e-8;
+ return (advantage - mean) / std;
+ }
+ /**
+ * Get state value from value table
+ */
+ getStateValue(state) {
+ const stateKey = this.encodeState(state);
+ const entry = this.valueTable.get(stateKey);
+ return entry?.value ?? 0;
+ }
+ /**
+ * Get state value from target network (for stability)
+ */
+ getTargetStateValue(state) {
+ const stateKey = this.encodeState(state);
+ const entry = this.targetValueTable.get(stateKey);
+ return entry?.value ?? this.getStateValue(state);
+ }
+ /**
+ * Sync target network with main network
+ */
+ syncTargetNetwork() {
+ this.targetValueTable.clear();
+ for (const [key, value] of this.valueTable.entries()) {
+ this.targetValueTable.set(key, { ...value });
+ }
+ this.logger.debug('Target network synchronized');
+ }
+ /**
+ * Extract experience components
+ */
+ extractExperience(experience) {
+ return {
+ state: experience.state,
+ action: experience.action,
+ reward: experience.reward,
+ nextState: experience.nextState,
+ done: experience.done ?? false
+ };
+ }
+ /**
+ * Get default exploration rate for reset
+ */
+ getDefaultExplorationRate() {
+ return this.defaultExploration;
+ }
+ /**
+ * Get actor-critic specific statistics
+ */
+ getActorCriticStatistics() {
+ // Calculate average state value
+ let totalValue = 0;
+ for (const entry of this.valueTable.values()) {
+ totalValue += entry.value;
+ }
+ const avgStateValue = this.valueTable.size > 0 ? totalValue / this.valueTable.size : 0;
+ // Calculate policy table size
+ let policySize = 0;
+ for (const statePolicy of this.policyTable.values()) {
+ policySize += statePolicy.size;
+ }
+ // Calculate average entropy
+ let totalEntropy = 0;
+ let entropyCount = 0;
+ for (const stateKey of this.policyTable.keys()) {
+ const entropy = this.calculateEntropyBonus(stateKey) / this.actorConfig.entropyCoefficient;
+ totalEntropy += entropy;
+ entropyCount++;
+ }
+ const avgEntropy = entropyCount > 0 ? totalEntropy / entropyCount : 0;
+ // Calculate advantage statistics
+ const advMean = this.advantageHistory.length > 0
+ ? this.advantageHistory.reduce((s, a) => s + a, 0) / this.advantageHistory.length
+ : 0;
+ const advVariance = this.advantageHistory.length > 0
+ ? this.advantageHistory.reduce((s, a) => s + (a - advMean) ** 2, 0) / this.advantageHistory.length
+ : 0;
+ return {
+ valueTableSize: this.valueTable.size,
+ policyTableSize: policySize,
+ avgStateValue,
+ avgEntropy,
+ advantageMean: advMean,
+ advantageStd: Math.sqrt(advVariance)
+ };
+ }
+ /**
+ * Reset actor-critic specific state
+ */
+ reset() {
+ super.reset();
+ this.policyTable.clear();
+ this.valueTable.clear();
+ this.targetValueTable.clear();
+ this.advantageHistory = [];
+ this.updatesSinceTargetSync = 0;
+ this.logger.info('ActorCriticLearner reset');
+ }
+ /**
+ * Export complete actor-critic state
+ */
+ exportActorCritic() {
+ const serializedPolicy = {};
+ for (const [state, actions] of this.policyTable.entries()) {
+ serializedPolicy[state] = {};
+ for (const [action, entry] of actions.entries()) {
+ serializedPolicy[state][action] = entry;
+ }
+ }
+ const serializedValue = {};
+ for (const [state, entry] of this.valueTable.entries()) {
+ serializedValue[state] = entry;
+ }
+ return {
+ base: this.export(),
+ valueTable: serializedValue,
+ policyTable: serializedPolicy,
+ actorConfig: { ...this.actorConfig }
+ };
+ }
+ /**
+ * Import complete actor-critic state
+ */
+ importActorCritic(state) {
+ this.import(state.base);
+ this.valueTable.clear();
+ for (const [stateKey, entry] of Object.entries(state.valueTable)) {
+ this.valueTable.set(stateKey, entry);
+ }
+ this.policyTable.clear();
+ for (const [stateKey, actions] of Object.entries(state.policyTable)) {
+ const actionMap = new Map();
+ for (const [actionKey, entry] of Object.entries(actions)) {
+ actionMap.set(actionKey, entry);
+ }
+ this.policyTable.set(stateKey, actionMap);
+ }
+ this.actorConfig = { ...state.actorConfig };
+ this.syncTargetNetwork();
+ this.logger.info('Imported Actor-Critic state', {
+ valueTableSize: this.valueTable.size,
+ policyTableSize: this.policyTable.size
+ });
+ }
+ }
+ exports.ActorCriticLearner = ActorCriticLearner;
+ /**
+ * Create default Actor-Critic configuration
+ */
+ function createDefaultActorCriticConfig() {
+ return {
+ learningRate: 0.1,
+ actorLearningRate: 0.01,
+ criticLearningRate: 0.1,
+ discountFactor: 0.95,
+ explorationRate: 0.3,
+ explorationDecay: 0.995,
+ minExplorationRate: 0.01,
+ entropyCoefficient: 0.01,
+ temperature: 1.0,
+ normalizeAdvantage: true,
+ targetUpdateFrequency: 100,
+ useExperienceReplay: true,
+ replayBufferSize: 10000,
+ batchSize: 32
+ };
+ }
+ //# sourceMappingURL=ActorCriticLearner.js.map
package/dist/learning/algorithms/ActorCriticLearner.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"ActorCriticLearner.js","sourceRoot":"","sources":["../../../src/learning/algorithms/ActorCriticLearner.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;;AAyhBH,wEAiBC;AAxiBD,2DAA0E;AA0C1E;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,MAAa,kBAAmB,SAAQ,qCAAiB;IASvD,YAAY,MAAyB;QACnC,KAAK,CAAC,MAAM,CAAC,CAAC;QACd,IAAI,CAAC,WAAW,GAAG,MAAM,CAAC;QAC1B,IAAI,CAAC,WAAW,GAAG,IAAI,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC,UAAU,GAAG,IAAI,GAAG,EAAE,CAAC;QAC5B,IAAI,CAAC,gBAAgB,GAAG,IAAI,GAAG,EAAE,CAAC;QAClC,IAAI,CAAC,sBAAsB,GAAG,CAAC,CAAC;QAChC,IAAI,CAAC,gBAAgB,GAAG,EAAE,CAAC;QAC3B,IAAI,CAAC,kBAAkB,GAAG,MAAM,CAAC,eAAe,CAAC;QAEjD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,gCAAgC,EAAE;YACjD,OAAO,EAAE,MAAM,CAAC,iBAAiB;YACjC,QAAQ,EAAE,MAAM,CAAC,kBAAkB;YACnC,OAAO,EAAE,MAAM,CAAC,kBAAkB;YAClC,WAAW,EAAE,MAAM,CAAC,WAAW;SAChC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACM,YAAY,CAAC,KAAgB,EAAE,gBAA+B;QACrE,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACzD,CAAC;QAED,+DAA+D;QAC/D,IAAI,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,eAAe,EAAE,CAAC;YAChD,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC;YACxE,OAAO,gBAAgB,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC;QAED,qBAAqB;QACrB,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;IACxD,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,KAAgB,EAAE,gBAA+B;QACxE,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,IAAI,CAAC,sBAAsB,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;QAE9E,uCAAuC;QACvC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC7B,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,UAAU,IAAI,aAAa,CAAC,CAAC,CAAC,CAAC;YAC/B,IAAI,MAAM,IAAI,UAAU,EAAE,CAAC;gBACzB,OAAO,gBAAgB,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,uDAAuD;QACvD,OAAO,gBAAgB,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACvD,CAAC;IAED;;;OAGG;IACK,sBAAsB,CAAC,QAAgB,EAAE,gBAA+B;QAC9E,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC;QACjD,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,oDAAoD;QACpD,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACtC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAC3D,WAAW,CAAC,IAAI,CAAC,UAAU,GAAG,WAAW,CAAC,CAAC;QAC7C,CAAC;QAED,mCAAmC;QACnC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,WAAW,CAAC,CAAC;QACzC,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;QAC7D,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAEvD,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;IACvC,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAgB,EAAE,SAAiB;QACvD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO,CAAC,CAAC,CAAC,+BAA+B;QAC3C,CAAC;QAED,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACzC,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IACvC,CAAC;IAED;;;;;;OAMG;IACM,MAAM,CAAC,UAA0B,EAAE,UAAwB;QAClE,IAAI,CAAC,SAAS,EAAE,CAAC;QAEjB,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,GAAG,IAAI,CAAC,iBAAiB,CAAC,UAAU,CAAC,CAAC;QACtF,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAE5C,gDAAgD;QAChD,MAAM,QAAQ,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QAC3C,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,SAAS,CAAC,CAAC;QAE7D,iCAAiC;QACjC,IAAI,SAAS,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,cAAc,GAAG,KAAK,GAAG,QAAQ,CAAC;QAEvE,iCAAiC;QACjC,IAAI,IAAI,CAAC,WAAW,CAAC,kBAAkB,EAAE,CAAC;YACx
C,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,SAAS,CAAC,CAAC;QACjD,CAAC;QAED,iCAAiC;QACjC,IAAI,CAAC,YAAY,CAAC,QAAQ,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC;QAEjD,wBAAwB;QACxB,IAAI,CAAC,WAAW,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;QAEjD,oCAAoC;QACpC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QACpC,CAAC;QAED,mCAAmC;QACnC,IAAI,CAAC,sBAAsB,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,sBAAsB,IAAI,IAAI,CAAC,WAAW,CAAC,qBAAqB,EAAE,CAAC;YAC1E,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACzB,IAAI,CAAC,sBAAsB,GAAG,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,qBAAqB,EAAE;YACvC,KAAK,EAAE,QAAQ;YACf,MAAM,EAAE,SAAS;YACjB,MAAM;YACN,SAAS;YACT,WAAW,EAAE,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,kBAAkB,GAAG,SAAS;SACxE,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACK,YAAY,CAAC,QAAgB,EAAE,QAAgB,EAAE,SAAiB;QACxE,MAAM,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,kBAAkB,GAAG,SAAS,CAAC;QAE5E,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACpD,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE;YAC5B,KAAK,EAAE,QAAQ;YACf,KAAK,EAAE,QAAQ;YACf,WAAW,EAAE,CAAC,aAAa,EAAE,WAAW,IAAI,CAAC,CAAC,GAAG,CAAC;YAClD,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;SACxB,CAAC,CAAC;IACL,CAAC;IAED;;;;OAIG;IACK,WAAW,CAAC,QAAgB,EAAE,SAAiB,EAAE,SAAiB;QACxE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACpC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QAC5C,CAAC;QACD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAE,CAAC;QAEpD,yCAAyC;QACzC,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAChD,MAAM,WAAW,GAAG,YAAY,EAAE,WAAW,IAAI,CAAC,CAAC;QAEnD,sEAAsE;QACtE,kDAAkD;QAClD,MAAM,YAAY,GAAG,IAAI,CAAC,qBAAqB,CAAC,QAAQ,CAAC,CAAC;QAC1D,MAAM,OAAO,GAAG,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,iBAAiB,GAAG,CAAC,SAAS,GAAG,YAAY,CAAC,CAAC;QAE9F,WAAW,CAAC,GAAG,CAAC,SAAS,EAAE;YACzB,MAAM,EAAE,SAAS;YACjB,WAAW,EAAE,OAAO;YACpB,cAAc,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,WAAW,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAC;YAChF,WAAW,EAAE,CAAC,YAAY,EAAE,WAAW,IAAI,CAAC,CAAC,GAAG,CAAC;YACjD,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;SACxB,CAAC,CAAC;QAEH,sDAAsD;QACtD,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAC/C,CAAC;IAED;;;OAGG;IACK,qBAAqB,CAAC,QAAgB;QAC5C,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO,CAAC,CAAC;QACX,CAAC;QAED,wCAAwC;QACxC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACvE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QACnC,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,CAAC;QACxF,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;QAE5C,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACV,OAAO,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,WAAW,CAAC,kBAAkB,GAAG,OAAO,CAAC;IACvD,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,QAAgB,EAAE,SAAiB;QACrD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QACnD,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC3C,OAAO,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,IAAI,CAAC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC;QAE1C,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,C
AAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;QACjE,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,KAAK,EAAE,CAAC;YACjC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,WAAW,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC,CAAC;YAC3D,MAAM,IAAI,GAAG,CAAC;YACd,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;gBACtB,SAAS,GAAG,GAAG,CAAC;YAClB,CAAC;QACH,CAAC;QAED,OAAO,SAAS,GAAG,MAAM,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,SAAiB;QAC1C,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEtC,uBAAuB;QACvB,IAAI,IAAI,CAAC,gBAAgB,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE,CAAC;QAChC,CAAC;QAED,IAAI,IAAI,CAAC,gBAAgB,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YACtC,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,MAAM,IAAI,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC;QAC7F,MAAM,QAAQ,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC;QAC/G,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;QAEvC,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,GAAG,CAAC;IAClC,CAAC;IAED;;OAEG;IACM,aAAa,CAAC,KAAgB;QACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC5C,OAAO,KAAK,EAAE,KAAK,IAAI,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,KAAgB;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAClD,OAAO,KAAK,EAAE,KAAK,IAAI,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACK,iBAAiB;QACvB,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC;YACrD,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;QAC/C,CAAC;QACD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACnD,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,UAA0B;QAOlD,OAAO;YACL,KAAK,EAAE,UAAU,CAAC,KAAK;YACvB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,SAAS,EAAE,UAAU,CAAC,SAAS;YAC/B,IAAI,EAAE,UAAU,CAAC,IAAI,IAAI,KAAK;SAC/B,CAAC;IACJ,CAAC;IAED;;OAEG;IACO,yBAAyB;QACjC,OAAO,IAAI,CAAC,kBAAkB,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,wBAAwB;QAQtB,gCAAgC;QAChC,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,EAAE,CAAC;YAC7C,UAAU,IAAI,KAAK,CAAC,KAAK,CAAC;QAC5B,CAAC;QACD,MAAM,aAAa,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEvF,8BAA8B;QAC9B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,WAAW,IAAI,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC;YACpD,UAAU,IAAI,WAAW,CAAC,IAAI,CAAC;QACjC,CAAC;QAED,4BAA4B;QAC5B,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,EAAE,CAAC;YAC/C,MAAM,OAAO,GAAG,IAAI,CAAC,qBAAqB,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,kBAAkB,CAAC;YAC3F,YAAY,IAAI,OAAO,CAAC;YACxB,YAAY,EAAE,CAAC;QACjB,CAAC;QACD,MAAM,UAAU,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtE,iCAAiC;QACjC,MAAM,OAAO,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC;YAC9C,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM;YACjF,CAAC,CAAC,CAAC,CAAC;QACN,MAAM,WAAW,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC;YAClD,CAAC,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM;YAClG,CAAC,CAAC,CAAC,CAA
C;QAEN,OAAO;YACL,cAAc,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI;YACpC,eAAe,EAAE,UAAU;YAC3B,aAAa;YACb,UAAU;YACV,aAAa,EAAE,OAAO;YACtB,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC;SACrC,CAAC;IACJ,CAAC;IAED;;OAEG;IACM,KAAK;QACZ,KAAK,CAAC,KAAK,EAAE,CAAC;QACd,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,IAAI,CAAC,gBAAgB,CAAC,KAAK,EAAE,CAAC;QAC9B,IAAI,CAAC,gBAAgB,GAAG,EAAE,CAAC;QAC3B,IAAI,CAAC,sBAAsB,GAAG,CAAC,CAAC;QAChC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,iBAAiB;QAMf,MAAM,gBAAgB,GAAgD,EAAE,CAAC;QACzE,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,EAAE,CAAC;YAC1D,gBAAgB,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;YAC7B,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;gBAChD,gBAAgB,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,MAAM,eAAe,GAAoC,EAAE,CAAC;QAC5D,KAAK,MAAM,CAAC,KAAK,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC;YACvD,eAAe,CAAC,KAAK,CAAC,GAAG,KAAK,CAAC;QACjC,CAAC;QAED,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,MAAM,EAAE;YACnB,UAAU,EAAE,eAAe;YAC3B,WAAW,EAAE,gBAAgB;YAC7B,WAAW,EAAE,EAAE,GAAG,IAAI,CAAC,WAAW,EAAE;SACrC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,iBAAiB,CAAC,KAAgD;QAChE,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAExB,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACxB,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC,EAAE,CAAC;YACjE,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;QAED,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;QACzB,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,WAAW,CAAC,EAAE,CAAC;YACpE,MAAM,SAAS,GAAG,IAAI,GAAG,EAAuB,CAAC;YACjD,KAAK,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;gBACzD,SAAS,CAAC,GAAG,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YAClC,CAAC;YACD,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QAC5C,CAAC;QAED,IAAI,CAAC,WAAW,GAAG,EAAE,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;QAC5C,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAEzB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,6BAA6B,EAAE;YAC9C,cAAc,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI;YACpC,eAAe,EAAE,IAAI,CAAC,WAAW,CAAC,IAAI;SACvC,CAAC,CAAC;IACL,CAAC;CACF;AA3cD,gDA2cC;AAED;;GAEG;AACH,SAAgB,8BAA8B;IAC5C,OAAO;QACL,YAAY,EAAE,GAAG;QACjB,iBAAiB,EAAE,IAAI;QACvB,kBAAkB,EAAE,GAAG;QACvB,cAAc,EAAE,IAAI;QACpB,eAAe,EAAE,GAAG;QACpB,gBAAgB,EAAE,KAAK;QACvB,kBAAkB,EAAE,IAAI;QACxB,kBAAkB,EAAE,IAAI;QACxB,WAAW,EAAE,GAAG;QAChB,kBAAkB,EAAE,IAAI;QACxB,qBAAqB,EAAE,GAAG;QAC1B,mBAAmB,EAAE,IAAI;QACzB,gBAAgB,EAAE,KAAK;QACvB,SAAS,EAAE,EAAE;KACd,CAAC;AACJ,CAAC"}
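The new ActorCriticLearner above documents its update rules as δ = r + γV(s') - V(s), V(s) += α_c·δ for the critic, a preference-based softmax policy for the actor, and an entropy bonus H(π). The following standalone TypeScript sketch illustrates those formulas on a tabular state space; the class and helper names here are illustrative only and are not part of the agentic-qe API.

```typescript
// Minimal tabular A2C sketch (illustrative names, not the package's API).
class AdvantageActorCritic {
  private values = new Map<string, number>();       // critic: V(s)
  private preferences = new Map<string, number>();  // actor: preference(s,a)

  constructor(
    private actorLR = 0.01,
    private criticLR = 0.1,
    private gamma = 0.95,
    private temperature = 1.0,
  ) {}

  // Softmax policy: π(a|s) = exp(pref(s,a)/τ) / Σ_a' exp(pref(s,a')/τ)
  policy(state: string, actions: string[]): number[] {
    const prefs = actions.map(a => (this.preferences.get(`${state}|${a}`) ?? 0) / this.temperature);
    const max = Math.max(...prefs);                  // subtract max for numerical stability
    const exps = prefs.map(p => Math.exp(p - max));
    const sum = exps.reduce((s, e) => s + e, 0);
    return exps.map(e => e / sum);
  }

  // Entropy of the policy at a state: H(π(·|s)) = -Σ_a π(a|s) log π(a|s)
  entropy(state: string, actions: string[]): number {
    return this.policy(state, actions).reduce((h, p) => (p > 0 ? h - p * Math.log(p) : h), 0);
  }

  // One TD update: δ = r + γV(s') - V(s); critic V(s) += α_c·δ; actor pref(s,a) += α_a·δ
  update(state: string, action: string, reward: number, nextState: string, done: boolean): number {
    const v = this.values.get(state) ?? 0;
    const vNext = done ? 0 : this.values.get(nextState) ?? 0;
    const advantage = reward + this.gamma * vNext - v;           // TD error used as advantage
    this.values.set(state, v + this.criticLR * advantage);       // critic step
    const key = `${state}|${action}`;
    const pref = this.preferences.get(key) ?? 0;
    this.preferences.set(key, pref + this.actorLR * advantage);  // actor step
    return advantage;
  }
}

// A positive advantage raises the preference, and hence the softmax probability, of the taken action.
const ac = new AdvantageActorCritic();
ac.update("s0", "retry", 1.0, "s1", false);
console.log(ac.policy("s0", ["retry", "skip"]), ac.entropy("s0", ["retry", "skip"]));
```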
package/dist/learning/algorithms/PPOLearner.d.ts
@@ -0,0 +1,207 @@
+ /**
+ * PPOLearner - Proximal Policy Optimization Algorithm
+ *
+ * Implements PPO-Clip, the most widely used variant of PPO:
+ * - Clipped surrogate objective to prevent large policy updates
+ * - Generalized Advantage Estimation (GAE) for variance reduction
+ * - Value function clipping for stability
+ * - Multiple epochs over collected trajectories
+ *
+ * Key features:
+ * - Trust region optimization without KL constraint
+ * - Sample efficient with mini-batch updates
+ * - Robust to hyperparameter choices
+ * - Suitable for continuous and discrete action spaces
+ *
+ * PPO-Clip objective:
+ * L^CLIP(θ) = E[min(r(θ)Â, clip(r(θ), 1-ε, 1+ε)Â)]
+ * where r(θ) = π_θ(a|s) / π_θ_old(a|s)
+ *
+ * @module learning/algorithms/PPOLearner
+ * @version 1.0.0
+ */
+ import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+ import { TaskState, AgentAction, TaskExperience } from '../types';
+ /**
+ * Configuration specific to PPO algorithm
+ */
+ export interface PPOConfig extends RLConfig {
+ /** Clipping parameter (ε) - typically 0.1-0.3 */
+ clipEpsilon: number;
+ /** Number of epochs to train on collected data */
+ ppoEpochs: number;
+ /** Mini-batch size for training */
+ miniBatchSize: number;
+ /** Value function loss coefficient */
+ valueLossCoefficient: number;
+ /** Entropy loss coefficient for exploration */
+ entropyCoefficient: number;
+ /** GAE lambda for advantage estimation */
+ gaeLambda: number;
+ /** Maximum gradient norm for clipping */
+ maxGradNorm: number;
+ /** Whether to clip value function updates */
+ clipValueLoss: boolean;
+ /** Learning rate for policy network */
+ policyLearningRate: number;
+ /** Learning rate for value network */
+ valueLearningRate: number;
+ }
+ /**
+ * Policy parameters for a state-action pair
+ */
+ interface PolicyParams {
+ preference: number;
+ logProb: number;
+ updateCount: number;
+ }
+ /**
+ * PPOLearner - Proximal Policy Optimization implementation
+ *
+ * PPO is a state-of-the-art policy gradient method that achieves
+ * strong performance while being simpler than TRPO.
+ *
+ * Usage:
+ * ```typescript
+ * const ppo = new PPOLearner({
+ * learningRate: 0.0003,
+ * discountFactor: 0.99,
+ * explorationRate: 0.0,
+ * explorationDecay: 1.0,
+ * minExplorationRate: 0.0,
+ * clipEpsilon: 0.2,
+ * ppoEpochs: 4,
+ * miniBatchSize: 64,
+ * valueLossCoefficient: 0.5,
+ * entropyCoefficient: 0.01,
+ * gaeLambda: 0.95,
+ * maxGradNorm: 0.5,
+ * clipValueLoss: true,
+ * policyLearningRate: 0.0003,
+ * valueLearningRate: 0.001,
+ * useExperienceReplay: false,
+ * replayBufferSize: 2048,
+ * batchSize: 64
+ * });
+ *
+ * // Collect trajectory
+ * ppo.collectStep(state, action, reward, nextState, done);
+ *
+ * // Train on collected trajectory
+ * ppo.trainOnTrajectory();
+ * ```
+ */
+ export declare class PPOLearner extends AbstractRLLearner {
+ private ppoConfig;
+ private policyTable;
+ private valueTable;
+ private oldPolicyTable;
+ private trajectory;
+ private readonly defaultExploration;
+ constructor(config: PPOConfig);
+ /**
+ * Select action using current policy (softmax)
+ */
+ selectAction(state: TaskState, availableActions: AgentAction[]): AgentAction;
+ /**
+ * Get action probabilities using softmax policy
+ */
+ private getActionProbabilities;
+ /**
+ * Get policy parameters for state-action pair
+ */
+ private getPolicyParams;
+ /**
+ * Get log probability of action under current policy
+ */
+ private getLogProb;
+ /**
+ * Get state value from value network
+ */
+ getStateValue(state: TaskState): number;
+ /**
+ * Collect a step in the trajectory
+ */
+ collectStep(state: TaskState, action: AgentAction, reward: number, nextState: TaskState, done: boolean): void;
+ /**
+ * Standard update interface - collects experience and trains when ready
+ */
+ update(experience: TaskExperience, nextAction?: AgentAction): void;
+ /**
+ * Train on collected trajectory using PPO
+ */
+ trainOnTrajectory(): void;
+ /**
+ * Compute Generalized Advantage Estimation (GAE)
+ *
+ * GAE: Â_t = Σ_{l=0}^∞ (γλ)^l δ_{t+l}
+ * where δ_t = r_t + γV(s_{t+1}) - V(s_t)
+ */
+ private computeGAE;
+ /**
+ * Save current policy as old policy for ratio computation
+ */
+ private saveOldPolicy;
+ /**
+ * Get old log probability for ratio computation
+ */
+ private getOldLogProb;
+ /**
+ * Train one epoch on the trajectory
+ */
+ private trainEpoch;
+ /**
+ * Train on a mini-batch
+ */
+ private trainMiniBatch;
+ /**
+ * Update policy parameters
+ */
+ private updatePolicy;
+ /**
+ * Update value function
+ */
+ private updateValue;
+ /**
+ * Compute entropy of policy at state
+ */
+ private computeEntropy;
+ /**
+ * Get default exploration rate for reset
+ */
+ protected getDefaultExplorationRate(): number;
+ /**
+ * Get PPO-specific statistics
+ */
+ getPPOStatistics(): {
+ trajectoryLength: number;
+ valueTableSize: number;
+ policyTableSize: number;
+ avgValue: number;
+ avgAdvantage: number;
+ clipFraction: number;
+ };
+ /**
+ * Reset PPO-specific state
+ */
+ reset(): void;
+ /**
+ * Export PPO state
+ */
+ exportPPO(): {
+ base: ReturnType<AbstractRLLearner['export']>;
+ policyTable: Record<string, Record<string, PolicyParams>>;
+ valueTable: Record<string, number>;
+ ppoConfig: PPOConfig;
+ };
+ /**
+ * Import PPO state
+ */
+ importPPO(state: ReturnType<typeof this.exportPPO>): void;
+ }
+ /**
+ * Create default PPO configuration
+ */
+ export declare function createDefaultPPOConfig(): PPOConfig;
+ export {};
+ //# sourceMappingURL=PPOLearner.d.ts.map
package/dist/learning/algorithms/PPOLearner.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"PPOLearner.d.ts","sourceRoot":"","sources":["../../../src/learning/algorithms/PPOLearner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAU,MAAM,qBAAqB,CAAC;AAC1E,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,SAAU,SAAQ,QAAQ;IACzC,iDAAiD;IACjD,WAAW,EAAE,MAAM,CAAC;IACpB,kDAAkD;IAClD,SAAS,EAAE,MAAM,CAAC;IAClB,mCAAmC;IACnC,aAAa,EAAE,MAAM,CAAC;IACtB,sCAAsC;IACtC,oBAAoB,EAAE,MAAM,CAAC;IAC7B,+CAA+C;IAC/C,kBAAkB,EAAE,MAAM,CAAC;IAC3B,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,yCAAyC;IACzC,WAAW,EAAE,MAAM,CAAC;IACpB,6CAA6C;IAC7C,aAAa,EAAE,OAAO,CAAC;IACvB,uCAAuC;IACvC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,sCAAsC;IACtC,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAiBD;;GAEG;AACH,UAAU,YAAY;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,qBAAa,UAAW,SAAQ,iBAAiB;IAC/C,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,WAAW,CAAyC;IAC5D,OAAO,CAAC,UAAU,CAAsB;IACxC,OAAO,CAAC,cAAc,CAAyC;IAC/D,OAAO,CAAC,UAAU,CAAmB;IACrC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAS;gBAEhC,MAAM,EAAE,SAAS;IAiB7B;;OAEG;IACM,YAAY,CAAC,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,WAAW,EAAE,GAAG,WAAW;IAsBrF;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAiB9B;;OAEG;IACH,OAAO,CAAC,eAAe;IAQvB;;OAEG;IACH,OAAO,CAAC,UAAU;IA6BlB;;OAEG;IACM,aAAa,CAAC,KAAK,EAAE,SAAS,GAAG,MAAM;IAKhD;;OAEG;IACH,WAAW,CACT,KAAK,EAAE,SAAS,EAChB,MAAM,EAAE,WAAW,EACnB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,SAAS,EACpB,IAAI,EAAE,OAAO,GACZ,IAAI;IAqBP;;OAEG;IACM,MAAM,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,EAAE,WAAW,GAAG,IAAI;IAe3E;;OAEG;IACH,iBAAiB,IAAI,IAAI;IAyBzB;;;;;OAKG;IACH,OAAO,CAAC,UAAU;IAsClB;;OAEG;IACH,OAAO,CAAC,aAAa;IAWrB;;OAEG;IACH,OAAO,CAAC,aAAa;IAQrB;;OAEG;IACH,OAAO,CAAC,UAAU;IAWlB;;OAEG;IACH,OAAO,CAAC,cAAc;IA0CtB;;OAEG;IACH,OAAO,CAAC,YAAY;IAkCpB;;OAEG;IACH,OAAO,CAAC,WAAW;IAMnB;;OAEG;IACH,OAAO,CAAC,cAAc;IAsBtB;;OAEG;IACH,SAAS,CAAC,yBAAyB,IAAI,MAAM;IAI7C;;OAEG;IACH,gBAAgB,IAAI;QAClB,gBAAgB,EAAE,MAAM,CAAC;QACzB,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;QACxB,QAAQ,EAAE,MAAM,CAAC;QACjB,YAAY,EAAE,MAAM,CAAC;QACrB,YAAY,EAAE,MAAM,CAAC;KACtB;IAyBD;;OAEG;IACM,KAAK,IAAI,IAAI;IAStB;;OAEG;IACH,SAAS,IAAI;QACX,IAAI,EAAE,UAAU,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;QAC9C,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC,CAAC;QAC1D,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACnC,SAAS,EAAE,SAAS,CAAC;KACtB;IAsBD;;OAEG;IACH,SAAS,CAAC,KAAK,EAAE,UAAU,CAAC,OAAO,IAAI,CAAC,SAAS,CAAC,GAAG,IAAI;CAwB1D;AAED;;GAEG;AACH,wBAAgB,sBAAsB,IAAI,SAAS,CAqBlD"}