agentic-qe 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/CHANGELOG.md +123 -0
  2. package/README.md +1 -1
  3. package/dist/agents/index.d.ts.map +1 -1
  4. package/dist/agents/index.js +5 -1
  5. package/dist/agents/index.js.map +1 -1
  6. package/dist/core/di/AgentDependencies.d.ts +127 -0
  7. package/dist/core/di/AgentDependencies.d.ts.map +1 -0
  8. package/dist/core/di/AgentDependencies.js +251 -0
  9. package/dist/core/di/AgentDependencies.js.map +1 -0
  10. package/dist/core/di/DIContainer.d.ts +149 -0
  11. package/dist/core/di/DIContainer.d.ts.map +1 -0
  12. package/dist/core/di/DIContainer.js +333 -0
  13. package/dist/core/di/DIContainer.js.map +1 -0
  14. package/dist/core/di/index.d.ts +11 -0
  15. package/dist/core/di/index.d.ts.map +1 -0
  16. package/dist/core/di/index.js +22 -0
  17. package/dist/core/di/index.js.map +1 -0
  18. package/dist/core/index.d.ts +1 -0
  19. package/dist/core/index.d.ts.map +1 -1
  20. package/dist/core/index.js +11 -1
  21. package/dist/core/index.js.map +1 -1
  22. package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
  23. package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
  24. package/dist/learning/ExperienceSharingProtocol.js +538 -0
  25. package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
  26. package/dist/learning/LearningEngine.d.ts +101 -1
  27. package/dist/learning/LearningEngine.d.ts.map +1 -1
  28. package/dist/learning/LearningEngine.js +330 -3
  29. package/dist/learning/LearningEngine.js.map +1 -1
  30. package/dist/learning/QLearning.d.ts +38 -125
  31. package/dist/learning/QLearning.d.ts.map +1 -1
  32. package/dist/learning/QLearning.js +46 -267
  33. package/dist/learning/QLearning.js.map +1 -1
  34. package/dist/learning/QLearningLegacy.d.ts +154 -0
  35. package/dist/learning/QLearningLegacy.d.ts.map +1 -0
  36. package/dist/learning/QLearningLegacy.js +337 -0
  37. package/dist/learning/QLearningLegacy.js.map +1 -0
  38. package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
  39. package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
  40. package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
  41. package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
  42. package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
  43. package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
  44. package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
  45. package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
  46. package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
  47. package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
  48. package/dist/learning/algorithms/PPOLearner.js +490 -0
  49. package/dist/learning/algorithms/PPOLearner.js.map +1 -0
  50. package/dist/learning/algorithms/QLearning.d.ts +68 -0
  51. package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
  52. package/dist/learning/algorithms/QLearning.js +116 -0
  53. package/dist/learning/algorithms/QLearning.js.map +1 -0
  54. package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
  55. package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
  56. package/dist/learning/algorithms/SARSALearner.js +252 -0
  57. package/dist/learning/algorithms/SARSALearner.js.map +1 -0
  58. package/dist/learning/algorithms/index.d.ts +29 -0
  59. package/dist/learning/algorithms/index.d.ts.map +1 -0
  60. package/dist/learning/algorithms/index.js +44 -0
  61. package/dist/learning/algorithms/index.js.map +1 -0
  62. package/dist/learning/index.d.ts +3 -0
  63. package/dist/learning/index.d.ts.map +1 -1
  64. package/dist/learning/index.js +15 -1
  65. package/dist/learning/index.js.map +1 -1
  66. package/dist/learning/types.d.ts +2 -0
  67. package/dist/learning/types.d.ts.map +1 -1
  68. package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
  69. package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
  70. package/dist/memory/DistributedPatternLibrary.js +370 -0
  71. package/dist/memory/DistributedPatternLibrary.js.map +1 -0
  72. package/dist/memory/PatternQualityScorer.d.ts +169 -0
  73. package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
  74. package/dist/memory/PatternQualityScorer.js +327 -0
  75. package/dist/memory/PatternQualityScorer.js.map +1 -0
  76. package/dist/memory/PatternReplicationService.d.ts +187 -0
  77. package/dist/memory/PatternReplicationService.d.ts.map +1 -0
  78. package/dist/memory/PatternReplicationService.js +392 -0
  79. package/dist/memory/PatternReplicationService.js.map +1 -0
  80. package/dist/providers/ClaudeProvider.d.ts +98 -0
  81. package/dist/providers/ClaudeProvider.d.ts.map +1 -0
  82. package/dist/providers/ClaudeProvider.js +418 -0
  83. package/dist/providers/ClaudeProvider.js.map +1 -0
  84. package/dist/providers/ILLMProvider.d.ts +287 -0
  85. package/dist/providers/ILLMProvider.d.ts.map +1 -0
  86. package/dist/providers/ILLMProvider.js +33 -0
  87. package/dist/providers/ILLMProvider.js.map +1 -0
  88. package/dist/providers/LLMProviderFactory.d.ts +154 -0
  89. package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
  90. package/dist/providers/LLMProviderFactory.js +426 -0
  91. package/dist/providers/LLMProviderFactory.js.map +1 -0
  92. package/dist/providers/RuvllmProvider.d.ts +107 -0
  93. package/dist/providers/RuvllmProvider.d.ts.map +1 -0
  94. package/dist/providers/RuvllmProvider.js +417 -0
  95. package/dist/providers/RuvllmProvider.js.map +1 -0
  96. package/dist/providers/index.d.ts +31 -0
  97. package/dist/providers/index.d.ts.map +1 -0
  98. package/dist/providers/index.js +69 -0
  99. package/dist/providers/index.js.map +1 -0
  100. package/package.json +1 -1
@@ -0,0 +1,116 @@
+ "use strict";
+ /**
+ * QLearning - Off-policy TD(0) Reinforcement Learning
+ *
+ * Implements standard Q-learning algorithm for reinforcement learning.
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
+ */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.QLearning = void 0;
+ const AbstractRLLearner_1 = require("./AbstractRLLearner");
+ /**
+ * Default Q-learning configuration
+ */
+ const DEFAULT_CONFIG = {
+ learningRate: 0.1,
+ discountFactor: 0.95,
+ explorationRate: 0.3,
+ explorationDecay: 0.995,
+ minExplorationRate: 0.01,
+ useExperienceReplay: true,
+ replayBufferSize: 10000,
+ batchSize: 32
+ };
+ /**
+ * QLearning - Standard Q-learning implementation
+ *
+ * Implements the classic Q-learning algorithm with:
+ * - Epsilon-greedy exploration policy
+ * - Off-policy temporal difference (TD) learning
+ * - Q-table for state-action values
+ * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
+ */
+ class QLearning extends AbstractRLLearner_1.AbstractRLLearner {
+ constructor(config = {}) {
+ const fullConfig = { ...DEFAULT_CONFIG, ...config };
+ super(fullConfig);
+ this.defaultConfig = fullConfig;
+ this.logger.info('QLearning initialized with off-policy TD(0)', { config: fullConfig });
+ }
+ /**
+ * Update Q-value using Q-learning update rule
+ * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ *
+ * @param experience The transition experience (s, a, r, s')
+ * @param nextAction Ignored in Q-learning (uses max Q-value instead)
+ */
+ update(experience, nextAction) {
+ const stateKey = this.encodeState(experience.state);
+ const actionKey = this.encodeAction(experience.action);
+ const nextStateKey = this.encodeState(experience.nextState);
+ // Get current Q-value Q(s,a)
+ const stateActions = this.qTable.get(stateKey);
+ const currentQ = stateActions?.get(actionKey)?.value ?? 0;
+ // Q-Learning: Get max Q-value for next state (greedy)
+ // This is the key difference from SARSA (which uses actual next action)
+ const nextStateActions = this.qTable.get(nextStateKey);
+ const maxNextQ = nextStateActions && nextStateActions.size > 0
+ ? Math.max(...Array.from(nextStateActions.values()).map(qv => qv.value))
+ : 0;
+ // Q-learning update rule
+ // Q(s,a) = Q(s,a) + α * [r + γ * max(Q(s',a')) - Q(s,a)]
+ const tdTarget = experience.reward + this.config.discountFactor * maxNextQ;
+ const tdError = tdTarget - currentQ;
+ const newQ = currentQ + this.config.learningRate * tdError;
+ // Update Q-value
+ this.setQValue(stateKey, actionKey, newQ);
+ // Add to experience replay buffer if enabled
+ if (this.replayBuffer) {
+ this.replayBuffer.add(experience, Math.abs(tdError)); // Priority based on TD error
+ }
+ this.stepCount++;
+ }
+ /**
+ * Get the default exploration rate for this algorithm
+ */
+ getDefaultExplorationRate() {
+ return this.defaultConfig.explorationRate;
+ }
+ /**
+ * Get algorithm name
+ */
+ getAlgorithmName() {
+ return 'Q-Learning';
+ }
+ /**
+ * Get algorithm type (off-policy)
+ */
+ getAlgorithmType() {
+ return 'off-policy';
+ }
+ /**
+ * Get detailed statistics including Q-learning-specific metrics
+ */
+ getDetailedStatistics() {
+ return {
+ algorithm: this.getAlgorithmName(),
+ type: this.getAlgorithmType(),
+ stats: this.getStatistics()
+ };
+ }
+ }
+ exports.QLearning = QLearning;
+ //# sourceMappingURL=QLearning.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"QLearning.js","sourceRoot":"","sources":["../../../src/learning/algorithms/QLearning.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;AAEH,2DAAkE;AAWlE;;GAEG;AACH,MAAM,cAAc,GAAoB;IACtC,YAAY,EAAE,GAAG;IACjB,cAAc,EAAE,IAAI;IACpB,eAAe,EAAE,GAAG;IACpB,gBAAgB,EAAE,KAAK;IACvB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,IAAI;IACzB,gBAAgB,EAAE,KAAK;IACvB,SAAS,EAAE,EAAE;CACd,CAAC;AAEF;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAa,SAAU,SAAQ,qCAAiB;IAG9C,YAAY,SAAmC,EAAE;QAC/C,MAAM,UAAU,GAAG,EAAE,GAAG,cAAc,EAAE,GAAG,MAAM,EAAE,CAAC;QACpD,KAAK,CAAC,UAAU,CAAC,CAAC;QAClB,IAAI,CAAC,aAAa,GAAG,UAAU,CAAC;QAChC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,6CAA6C,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,UAA0B,EAAE,UAAwB;QACzD,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QACpD,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QACvD,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;QAE5D,6BAA6B;QAC7B,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,YAAY,EAAE,GAAG,CAAC,SAAS,CAAC,EAAE,KAAK,IAAI,CAAC,CAAC;QAE1D,sDAAsD;QACtD,wEAAwE;QACxE,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,gBAAgB,IAAI,gBAAgB,CAAC,IAAI,GAAG,CAAC;YAC5D,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC;YACxE,CAAC,CAAC,CAAC,CAAC;QAEN,yBAAyB;QACzB,yDAAyD;QACzD,MAAM,QAAQ,GAAG,UAAU,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,cAAc,GAAG,QAAQ,CAAC;QAC3E,MAAM,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;QACpC,MAAM,IAAI,GAAG,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,YAAY,GAAG,OAAO,CAAC;QAE3D,iBAAiB;QACjB,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAE1C,6CAA6C;QAC7C,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,6BAA6B;QACrF,CAAC;QAED,IAAI,CAAC,SAAS,EAAE,CAAC;IACnB,CAAC;IAED;;OAEG;IACO,yBAAyB;QACjC,OAAO,IAAI,CAAC,aAAa,CAAC,eAAe,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,qBAAqB;QAKnB,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAClC,IAAI,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAC7B,KAAK,EAAE,IAAI,CAAC,aAAa,EAAE;SAC5B,CAAC;IACJ,CAAC;CACF;AArFD,8BAqFC"}
@@ -0,0 +1,107 @@
+ /**
+ * SARSALearner - On-policy TD(0) Reinforcement Learning
+ *
+ * Implements SARSA (State-Action-Reward-State-Action) algorithm.
+ * Key differences from Q-Learning:
+ * - On-policy: learns Q-values for the policy being followed (epsilon-greedy)
+ * - Uses actual next action taken, not the max Q-value
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * - More conservative than Q-Learning, safer for exploration
+ */
+ import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+ import { TaskExperience, AgentAction, TaskState } from '../types';
+ /**
+ * SARSA configuration (same as base RL config)
+ */
+ export type SARSAConfig = RLConfig;
+ /**
+ * SARSALearner - On-policy Temporal Difference Learning
+ *
+ * SARSA is an on-policy TD control algorithm that learns the Q-values
+ * for the policy being followed (typically epsilon-greedy).
+ *
+ * Key Characteristics:
+ * - Updates based on (State, Action, Reward, next State, next Action)
+ * - Learns Q-values for the actual policy (including exploration)
+ * - More conservative than Q-Learning
+ * - Better for tasks where exploration is risky
+ * - Converges to optimal policy under certain conditions
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * where a' is the action actually taken in state s' (not necessarily greedy)
+ */
+ export declare class SARSALearner extends AbstractRLLearner {
+ private readonly defaultConfig;
+ private lastStateAction?;
+ constructor(config?: Partial<RLConfig>);
+ /**
+ * Update Q-value using SARSA on-policy update rule
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ *
+ * @param experience The transition experience (s, a, r, s')
+ * @param nextAction The actual action taken in next state (SARSA requires this!)
+ * If not provided, selects action using current policy (epsilon-greedy)
+ */
+ update(experience: TaskExperience, nextAction?: AgentAction): void;
+ /**
+ * Calculate expected value for next state under current epsilon-greedy policy
+ * This is used when we don't have the actual next action (e.g., in batch updates)
+ *
+ * Expected SARSA: E[Q(s',a')] = ε/|A| * Σ Q(s',a) + (1-ε) * max Q(s',a)
+ */
+ private getExpectedValue;
+ /**
+ * Select next action and update with SARSA
+ * This is the typical SARSA flow: select action, observe reward, select next action, update
+ *
+ * @param currentState Current state
+ * @param currentAction Action taken in current state
+ * @param reward Reward received
+ * @param nextState Next state observed
+ * @param availableActions Actions available in next state
+ * @returns Next action selected (for continued learning)
+ */
+ selectAndUpdate(currentState: TaskState, currentAction: AgentAction, reward: number, nextState: TaskState, availableActions: AgentAction[]): AgentAction;
+ /**
+ * Learn from a complete episode trajectory
+ * Updates all state-action pairs in the trajectory using SARSA
+ *
+ * @param trajectory Array of (state, action, reward) tuples
+ */
+ learnFromEpisode(trajectory: Array<{
+ state: TaskState;
+ action: AgentAction;
+ reward: number;
+ }>): void;
+ /**
+ * Get the default exploration rate for this algorithm
+ */
+ protected getDefaultExplorationRate(): number;
+ /**
+ * Get algorithm name
+ */
+ getAlgorithmName(): string;
+ /**
+ * Get algorithm type (on-policy)
+ */
+ getAlgorithmType(): 'on-policy' | 'off-policy';
+ /**
+ * Get detailed statistics including SARSA-specific metrics
+ */
+ getDetailedStatistics(): {
+ algorithm: string;
+ type: 'on-policy' | 'off-policy';
+ stats: ReturnType<AbstractRLLearner['getStatistics']>;
+ };
+ /**
+ * Compare performance with expected convergence
+ * SARSA typically converges slower but more safely than Q-Learning
+ */
+ getConvergenceMetrics(): {
+ isConverging: boolean;
+ convergenceRate: number;
+ stability: number;
+ };
+ }
+ //# sourceMappingURL=SARSALearner.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"SARSALearner.d.ts","sourceRoot":"","sources":["../../../src/learning/algorithms/SARSALearner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAClE,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAElE;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,QAAQ,CAAC;AAgBnC;;;;;;;;;;;;;;;;GAgBG;AACH,qBAAa,YAAa,SAAQ,iBAAiB;IACjD,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAW;IACzC,OAAO,CAAC,eAAe,CAAC,CAAoC;gBAEhD,MAAM,GAAE,OAAO,CAAC,QAAQ,CAAM;IAO1C;;;;;;;OAOG;IACH,MAAM,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,EAAE,WAAW,GAAG,IAAI;IA8ClE;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAwBxB;;;;;;;;;;OAUG;IACH,eAAe,CACb,YAAY,EAAE,SAAS,EACvB,aAAa,EAAE,WAAW,EAC1B,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,SAAS,EACpB,gBAAgB,EAAE,WAAW,EAAE,GAC9B,WAAW;IAsBd;;;;;OAKG;IACH,gBAAgB,CACd,UAAU,EAAE,KAAK,CAAC;QAChB,KAAK,EAAE,SAAS,CAAC;QACjB,MAAM,EAAE,WAAW,CAAC;QACpB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC,GACD,IAAI;IA0CP;;OAEG;IACH,SAAS,CAAC,yBAAyB,IAAI,MAAM;IAI7C;;OAEG;IACH,gBAAgB,IAAI,MAAM;IAI1B;;OAEG;IACH,gBAAgB,IAAI,WAAW,GAAG,YAAY;IAI9C;;OAEG;IACH,qBAAqB,IAAI;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,IAAI,EAAE,WAAW,GAAG,YAAY,CAAC;QACjC,KAAK,EAAE,UAAU,CAAC,iBAAiB,CAAC,eAAe,CAAC,CAAC,CAAC;KACvD;IAQD;;;OAGG;IACH,qBAAqB,IAAI;QACvB,YAAY,EAAE,OAAO,CAAC;QACtB,eAAe,EAAE,MAAM,CAAC;QACxB,SAAS,EAAE,MAAM,CAAC;KACnB;CA+BF"}
@@ -0,0 +1,252 @@
+ "use strict";
+ /**
+ * SARSALearner - On-policy TD(0) Reinforcement Learning
+ *
+ * Implements SARSA (State-Action-Reward-State-Action) algorithm.
+ * Key differences from Q-Learning:
+ * - On-policy: learns Q-values for the policy being followed (epsilon-greedy)
+ * - Uses actual next action taken, not the max Q-value
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * - More conservative than Q-Learning, safer for exploration
+ */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.SARSALearner = void 0;
+ const AbstractRLLearner_1 = require("./AbstractRLLearner");
+ /**
+ * Default SARSA configuration
+ */
+ const DEFAULT_SARSA_CONFIG = {
+ learningRate: 0.1,
+ discountFactor: 0.95,
+ explorationRate: 0.3,
+ explorationDecay: 0.995,
+ minExplorationRate: 0.01,
+ useExperienceReplay: true,
+ replayBufferSize: 10000,
+ batchSize: 32
+ };
+ /**
+ * SARSALearner - On-policy Temporal Difference Learning
+ *
+ * SARSA is an on-policy TD control algorithm that learns the Q-values
+ * for the policy being followed (typically epsilon-greedy).
+ *
+ * Key Characteristics:
+ * - Updates based on (State, Action, Reward, next State, next Action)
+ * - Learns Q-values for the actual policy (including exploration)
+ * - More conservative than Q-Learning
+ * - Better for tasks where exploration is risky
+ * - Converges to optimal policy under certain conditions
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * where a' is the action actually taken in state s' (not necessarily greedy)
+ */
+ class SARSALearner extends AbstractRLLearner_1.AbstractRLLearner {
+ constructor(config = {}) {
+ const fullConfig = { ...DEFAULT_SARSA_CONFIG, ...config };
+ super(fullConfig);
+ this.defaultConfig = fullConfig;
+ this.logger.info('SARSALearner initialized with on-policy TD(0)', { config: fullConfig });
+ }
+ /**
+ * Update Q-value using SARSA on-policy update rule
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ *
+ * @param experience The transition experience (s, a, r, s')
+ * @param nextAction The actual action taken in next state (SARSA requires this!)
+ * If not provided, selects action using current policy (epsilon-greedy)
+ */
+ update(experience, nextAction) {
+ const stateKey = this.encodeState(experience.state);
+ const actionKey = this.encodeAction(experience.action);
+ const nextStateKey = this.encodeState(experience.nextState);
+ // Get current Q-value Q(s,a)
+ const stateActions = this.qTable.get(stateKey);
+ const currentQ = stateActions?.get(actionKey)?.value ?? 0;
+ // SARSA: Get Q-value for next action that will actually be taken
+ // This is the key difference from Q-Learning (which uses max Q-value)
+ let nextQ = 0;
+ if (nextAction) {
+ // Use provided next action (typical in online learning)
+ const nextActionKey = this.encodeAction(nextAction);
+ const nextStateActions = this.qTable.get(nextStateKey);
+ nextQ = nextStateActions?.get(nextActionKey)?.value ?? 0;
+ }
+ else {
+ // If no next action provided, we need to select one using epsilon-greedy
+ // This happens in batch updates from experience replay
+ // We approximate by using a greedy action (conservative estimate)
+ const nextStateActions = this.qTable.get(nextStateKey);
+ if (nextStateActions && nextStateActions.size > 0) {
+ // Use expected SARSA approximation: average over all actions weighted by policy
+ nextQ = this.getExpectedValue(experience.nextState, nextStateActions);
+ }
+ }
+ // SARSA update rule
+ // Q(s,a) = Q(s,a) + α * [r + γ * Q(s',a') - Q(s,a)]
+ const tdTarget = experience.reward + this.config.discountFactor * nextQ;
+ const tdError = tdTarget - currentQ;
+ const newQ = currentQ + this.config.learningRate * tdError;
+ // Update Q-value
+ this.setQValue(stateKey, actionKey, newQ);
+ // Add to experience replay buffer if enabled
+ if (this.replayBuffer) {
+ this.replayBuffer.add(experience, Math.abs(tdError)); // Priority based on TD error
+ }
+ this.stepCount++;
+ }
+ /**
+ * Calculate expected value for next state under current epsilon-greedy policy
+ * This is used when we don't have the actual next action (e.g., in batch updates)
+ *
+ * Expected SARSA: E[Q(s',a')] = ε/|A| * Σ Q(s',a) + (1-ε) * max Q(s',a)
+ */
+ getExpectedValue(nextState, nextStateActions) {
+ if (nextStateActions.size === 0) {
+ return 0;
+ }
+ const epsilon = this.config.explorationRate;
+ const numActions = nextStateActions.size;
+ // Calculate average Q-value (for random exploration)
+ let sumQ = 0;
+ let maxQ = -Infinity;
+ for (const qValue of nextStateActions.values()) {
+ sumQ += qValue.value;
+ maxQ = Math.max(maxQ, qValue.value);
+ }
+ const avgQ = sumQ / numActions;
+ // Expected value under epsilon-greedy policy
+ // ε * (average of all actions) + (1-ε) * (max action)
+ return epsilon * avgQ + (1 - epsilon) * maxQ;
+ }
+ /**
+ * Select next action and update with SARSA
+ * This is the typical SARSA flow: select action, observe reward, select next action, update
+ *
+ * @param currentState Current state
+ * @param currentAction Action taken in current state
+ * @param reward Reward received
+ * @param nextState Next state observed
+ * @param availableActions Actions available in next state
+ * @returns Next action selected (for continued learning)
+ */
+ selectAndUpdate(currentState, currentAction, reward, nextState, availableActions) {
+ // Select next action using epsilon-greedy policy
+ const nextAction = this.selectAction(nextState, availableActions);
+ // Create experience
+ const experience = {
+ taskId: `sarsa-${Date.now()}`,
+ taskType: 'online-learning',
+ state: currentState,
+ action: currentAction,
+ reward,
+ nextState,
+ timestamp: new Date(),
+ agentId: 'sarsa-learner'
+ };
+ // Update Q-value using SARSA rule with actual next action
+ this.update(experience, nextAction);
+ return nextAction;
+ }
+ /**
+ * Learn from a complete episode trajectory
+ * Updates all state-action pairs in the trajectory using SARSA
+ *
+ * @param trajectory Array of (state, action, reward) tuples
+ */
+ learnFromEpisode(trajectory) {
+ // SARSA updates each transition with the next action in the trajectory
+ for (let i = 0; i < trajectory.length - 1; i++) {
+ const current = trajectory[i];
+ const next = trajectory[i + 1];
+ const experience = {
+ taskId: `episode-${Date.now()}-${i}`,
+ taskType: 'episode-learning',
+ state: current.state,
+ action: current.action,
+ reward: current.reward,
+ nextState: next.state,
+ timestamp: new Date(),
+ agentId: 'sarsa-learner'
+ };
+ // Update with the actual next action from trajectory
+ this.update(experience, next.action);
+ }
+ // Handle terminal state (last transition)
+ if (trajectory.length > 0) {
+ const last = trajectory[trajectory.length - 1];
+ const terminalExperience = {
+ taskId: `episode-${Date.now()}-terminal`,
+ taskType: 'episode-learning',
+ state: last.state,
+ action: last.action,
+ reward: last.reward,
+ nextState: last.state, // Terminal state transitions to itself
+ timestamp: new Date(),
+ agentId: 'sarsa-learner'
+ };
+ // Terminal state has no next action, Q(terminal, any) = 0
+ this.update(terminalExperience);
+ }
+ this.endEpisode();
+ }
+ /**
+ * Get the default exploration rate for this algorithm
+ */
+ getDefaultExplorationRate() {
+ return this.defaultConfig.explorationRate;
+ }
+ /**
+ * Get algorithm name
+ */
+ getAlgorithmName() {
+ return 'SARSA';
+ }
+ /**
+ * Get algorithm type (on-policy)
+ */
+ getAlgorithmType() {
+ return 'on-policy';
+ }
+ /**
+ * Get detailed statistics including SARSA-specific metrics
+ */
+ getDetailedStatistics() {
+ return {
+ algorithm: this.getAlgorithmName(),
+ type: this.getAlgorithmType(),
+ stats: this.getStatistics()
+ };
+ }
+ /**
+ * Compare performance with expected convergence
+ * SARSA typically converges slower but more safely than Q-Learning
+ */
+ getConvergenceMetrics() {
+ const stats = this.getStatistics();
+ // Check if Q-values are stabilizing
+ const avgQValue = stats.avgQValue;
+ const qValueRange = stats.maxQValue - stats.minQValue;
+ // Convergence indicators:
+ // 1. Low exploration rate (mostly exploiting)
+ // 2. Reasonable Q-value range (not diverging)
+ // 3. Sufficient episodes for learning
+ const isConverging = stats.explorationRate < 0.1 && // Low exploration
+ qValueRange < 10 && // Bounded Q-values
+ stats.episodes > 20; // Sufficient training
+ const convergenceRate = stats.episodes > 0
+ ? Math.min(1.0, stats.episodes / 100)
+ : 0;
+ const stability = qValueRange > 0
+ ? 1.0 - Math.min(1.0, qValueRange / 20)
+ : 0.5;
+ return {
+ isConverging,
+ convergenceRate,
+ stability
+ };
+ }
+ }
+ exports.SARSALearner = SARSALearner;
+ //# sourceMappingURL=SARSALearner.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"SARSALearner.js","sourceRoot":"","sources":["../../../src/learning/algorithms/SARSALearner.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;AAEH,2DAAkE;AAQlE;;GAEG;AACH,MAAM,oBAAoB,GAAa;IACrC,YAAY,EAAE,GAAG;IACjB,cAAc,EAAE,IAAI;IACpB,eAAe,EAAE,GAAG;IACpB,gBAAgB,EAAE,KAAK;IACvB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,IAAI;IACzB,gBAAgB,EAAE,KAAK;IACvB,SAAS,EAAE,EAAE;CACd,CAAC;AAEF;;;;;;;;;;;;;;;;GAgBG;AACH,MAAa,YAAa,SAAQ,qCAAiB;IAIjD,YAAY,SAA4B,EAAE;QACxC,MAAM,UAAU,GAAG,EAAE,GAAG,oBAAoB,EAAE,GAAG,MAAM,EAAE,CAAC;QAC1D,KAAK,CAAC,UAAU,CAAC,CAAC;QAClB,IAAI,CAAC,aAAa,GAAG,UAAU,CAAC;QAChC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,+CAA+C,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;IAC5F,CAAC;IAED;;;;;;;OAOG;IACH,MAAM,CAAC,UAA0B,EAAE,UAAwB;QACzD,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QACpD,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QACvD,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;QAE5D,6BAA6B;QAC7B,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,YAAY,EAAE,GAAG,CAAC,SAAS,CAAC,EAAE,KAAK,IAAI,CAAC,CAAC;QAE1D,iEAAiE;QACjE,sEAAsE;QACtE,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,IAAI,UAAU,EAAE,CAAC;YACf,wDAAwD;YACxD,MAAM,aAAa,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;YACpD,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;YACvD,KAAK,GAAG,gBAAgB,EAAE,GAAG,CAAC,aAAa,CAAC,EAAE,KAAK,IAAI,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,yEAAyE;YACzE,uDAAuD;YACvD,kEAAkE;YAClE,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;YACvD,IAAI,gBAAgB,IAAI,gBAAgB,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAClD,gFAAgF;gBAChF,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,SAAS,EAAE,gBAAgB,CAAC,CAAC;YACxE,CAAC;QACH,CAAC;QAED,oBAAoB;QACpB,oDAAoD;QACpD,MAAM,QAAQ,GAAG,UAAU,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,cAAc,GAAG,KAAK,CAAC;QACxE,MAAM,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;QACpC,MAAM,IAAI,GAAG,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,YAAY,GAAG,OAAO,CAAC;QAE3D,iBAAiB;QACjB,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAE1C,6CAA6C;QAC7C,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,6BAA6B;QACrF,CAAC;QAED,IAAI,CAAC,SAAS,EAAE,CAAC;IACnB,CAAC;IAED;;;;;OAKG;IACK,gBAAgB,CAAC,SAAoB,EAAE,gBAAkC;QAC/E,IAAI,gBAAgB,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,CAAC;QACX,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC;QAC5C,MAAM,UAAU,GAAG,gBAAgB,CAAC,IAAI,CAAC;QAEzC,qDAAqD;QACrD,IAAI,IAAI,GAAG,CAAC,CAAC;QACb,IAAI,IAAI,GAAG,CAAC,QAAQ,CAAC;QAErB,KAAK,MAAM,MAAM,IAAI,gBAAgB,CAAC,MAAM,EAAE,EAAE,CAAC;YAC/C,IAAI,IAAI,MAAM,CAAC,KAAK,CAAC;YACrB,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QACtC,CAAC;QAED,MAAM,IAAI,GAAG,IAAI,GAAG,UAAU,CAAC;QAE/B,6CAA6C;QAC7C,sDAAsD;QACtD,OAAO,OAAO,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC;IAC/C,CAAC;IAED;;;;;;;;;;OAUG;IACH,eAAe,CACb,YAAuB,EACvB,aAA0B,EAC1B,MAAc,EACd,SAAoB,EACpB,gBAA+B;QAE/B,iDAAiD;QACjD,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,SAAS,EAAE,gBAAgB,CAAC,CAAC;QAElE,oBAAoB;QACpB,MAAM,UAAU,GAAmB;YACjC,MAAM,EAAE,SAAS,IAAI,CAAC,GAAG,EAAE,EAAE;YAC7B,QAAQ,EAAE,iBAAiB;YAC3B,KAAK,EAAE,YAAY;YACnB,MAAM,EAAE,aAAa;YACrB,MAAM;YACN,SAAS;YACT,SAAS,EAAE,IAAI,IAAI,EAAE;YACrB,OAAO,EAAE,eAAe;SACzB,CAAC;QAEF,0DAA0D;QAC1D,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;QAEpC,OAAO,UAAU,CAAC;IACpB,CAAC;IAED;;;;;OAKG;IACH,gBAAgB,CACd,UAIE;QAEF,uEAAuE;QACvE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,OAAO,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YAE/B,MAAM,UAAU,GA
AmB;gBACjC,MAAM,EAAE,WAAW,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,EAAE;gBACpC,QAAQ,EAAE,kBAAkB;gBAC5B,KAAK,EAAE,OAAO,CAAC,KAAK;gBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,SAAS,EAAE,IAAI,CAAC,KAAK;gBACrB,SAAS,EAAE,IAAI,IAAI,EAAE;gBACrB,OAAO,EAAE,eAAe;aACzB,CAAC;YAEF,qDAAqD;YACrD,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC;QAED,0CAA0C;QAC1C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,UAAU,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YAC/C,MAAM,kBAAkB,GAAmB;gBACzC,MAAM,EAAE,WAAW,IAAI,CAAC,GAAG,EAAE,WAAW;gBACxC,QAAQ,EAAE,kBAAkB;gBAC5B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,SAAS,EAAE,IAAI,CAAC,KAAK,EAAE,uCAAuC;gBAC9D,SAAS,EAAE,IAAI,IAAI,EAAE;gBACrB,OAAO,EAAE,eAAe;aACzB,CAAC;YAEF,0DAA0D;YAC1D,IAAI,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,CAAC,UAAU,EAAE,CAAC;IACpB,CAAC;IAED;;OAEG;IACO,yBAAyB;QACjC,OAAO,IAAI,CAAC,aAAa,CAAC,eAAe,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,qBAAqB;QAKnB,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAClC,IAAI,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAC7B,KAAK,EAAE,IAAI,CAAC,aAAa,EAAE;SAC5B,CAAC;IACJ,CAAC;IAED;;;OAGG;IACH,qBAAqB;QAKnB,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QAEnC,oCAAoC;QACpC,MAAM,SAAS,GAAG,KAAK,CAAC,SAAS,CAAC;QAClC,MAAM,WAAW,GAAG,KAAK,CAAC,SAAS,GAAG,KAAK,CAAC,SAAS,CAAC;QAEtD,0BAA0B;QAC1B,8CAA8C;QAC9C,8CAA8C;QAC9C,sCAAsC;QAEtC,MAAM,YAAY,GAChB,KAAK,CAAC,eAAe,GAAG,GAAG,IAAI,kBAAkB;YACjD,WAAW,GAAG,EAAE,IAAI,mBAAmB;YACvC,KAAK,CAAC,QAAQ,GAAG,EAAE,CAAC,CAAC,sBAAsB;QAE7C,MAAM,eAAe,GAAG,KAAK,CAAC,QAAQ,GAAG,CAAC;YACxC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC;YACrC,CAAC,CAAC,CAAC,CAAC;QAEN,MAAM,SAAS,GAAG,WAAW,GAAG,CAAC;YAC/B,CAAC,CAAC,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,WAAW,GAAG,EAAE,CAAC;YACvC,CAAC,CAAC,GAAG,CAAC;QAER,OAAO;YACL,YAAY;YACZ,eAAe;YACf,SAAS;SACV,CAAC;IACJ,CAAC;CACF;AAvQD,oCAuQC"}
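When update() is called without a next action, SARSALearner falls back to the expected-SARSA estimate documented in the hunk above. A minimal, self-contained TypeScript sketch of that fallback (illustrative only, not code from the package; the Q-table is reduced to a plain array of values):

// Hedged sketch of the expected-SARSA value under an ε-greedy policy:
// E[Q(s',·)] = ε · mean(Q(s',a)) + (1 − ε) · max(Q(s',a))
function expectedSarsaValue(nextQValues: number[], epsilon: number): number {
  if (nextQValues.length === 0) return 0; // unseen next state, as in the class
  const maxQ = Math.max(...nextQValues);
  const avgQ = nextQValues.reduce((sum, q) => sum + q, 0) / nextQValues.length;
  return epsilon * avgQ + (1 - epsilon) * maxQ;
}

// With the package default ε = 0.3 and Q(s',·) = [0.2, 0.6, 1.0]:
// 0.3 * 0.6 + 0.7 * 1.0 = 0.88
console.log(expectedSarsaValue([0.2, 0.6, 1.0], 0.3)); // 0.88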
@@ -0,0 +1,29 @@
+ /**
+ * Learning Algorithms - Reinforcement Learning Implementations
+ *
+ * This module provides various RL algorithms for agent learning:
+ * - AbstractRLLearner: Base class for all RL algorithms
+ * - QLearning: Off-policy TD(0) algorithm
+ * - SARSALearner: On-policy TD(0) algorithm
+ * - ActorCriticLearner: Advantage Actor-Critic (A2C) algorithm
+ * - PPOLearner: Proximal Policy Optimization (PPO-Clip) algorithm
+ */
+ import { AbstractRLLearner, RLConfig, QValue } from './AbstractRLLearner';
+ import { QLearning, QLearningConfig } from '../QLearning';
+ import { SARSALearner, SARSAConfig } from './SARSALearner';
+ import { ActorCriticLearner, ActorCriticConfig, createDefaultActorCriticConfig } from './ActorCriticLearner';
+ import { PPOLearner, PPOConfig, createDefaultPPOConfig } from './PPOLearner';
+ export { AbstractRLLearner, RLConfig, QValue };
+ export { QLearning, QLearningConfig };
+ export { SARSALearner, SARSAConfig };
+ export { ActorCriticLearner, ActorCriticConfig, createDefaultActorCriticConfig };
+ export { PPOLearner, PPOConfig, createDefaultPPOConfig };
+ /**
+ * Supported RL algorithm types
+ */
+ export type RLAlgorithmType = 'q-learning' | 'sarsa' | 'actor-critic' | 'ppo';
+ /**
+ * Factory function to create RL algorithm instances
+ */
+ export declare function createRLAlgorithm(type: RLAlgorithmType, config?: any): AbstractRLLearner;
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/learning/algorithms/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,qBAAqB,CAAC;AAC1E,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC3D,OAAO,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,8BAA8B,EAAE,MAAM,sBAAsB,CAAC;AAC7G,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,sBAAsB,EAAE,MAAM,cAAc,CAAC;AAE7E,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;AAC/C,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,CAAC;AACtC,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,CAAC;AACrC,OAAO,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,8BAA8B,EAAE,CAAC;AACjF,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,sBAAsB,EAAE,CAAC;AAEzD;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,cAAc,GAAG,KAAK,CAAC;AAE9E;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,eAAe,EACrB,MAAM,CAAC,EAAE,GAAG,GACX,iBAAiB,CAanB"}
@@ -0,0 +1,44 @@
+ "use strict";
+ /**
+ * Learning Algorithms - Reinforcement Learning Implementations
+ *
+ * This module provides various RL algorithms for agent learning:
+ * - AbstractRLLearner: Base class for all RL algorithms
+ * - QLearning: Off-policy TD(0) algorithm
+ * - SARSALearner: On-policy TD(0) algorithm
+ * - ActorCriticLearner: Advantage Actor-Critic (A2C) algorithm
+ * - PPOLearner: Proximal Policy Optimization (PPO-Clip) algorithm
+ */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.createDefaultPPOConfig = exports.PPOLearner = exports.createDefaultActorCriticConfig = exports.ActorCriticLearner = exports.SARSALearner = exports.QLearning = exports.AbstractRLLearner = void 0;
+ exports.createRLAlgorithm = createRLAlgorithm;
+ const AbstractRLLearner_1 = require("./AbstractRLLearner");
+ Object.defineProperty(exports, "AbstractRLLearner", { enumerable: true, get: function () { return AbstractRLLearner_1.AbstractRLLearner; } });
+ const QLearning_1 = require("../QLearning");
+ Object.defineProperty(exports, "QLearning", { enumerable: true, get: function () { return QLearning_1.QLearning; } });
+ const SARSALearner_1 = require("./SARSALearner");
+ Object.defineProperty(exports, "SARSALearner", { enumerable: true, get: function () { return SARSALearner_1.SARSALearner; } });
+ const ActorCriticLearner_1 = require("./ActorCriticLearner");
+ Object.defineProperty(exports, "ActorCriticLearner", { enumerable: true, get: function () { return ActorCriticLearner_1.ActorCriticLearner; } });
+ Object.defineProperty(exports, "createDefaultActorCriticConfig", { enumerable: true, get: function () { return ActorCriticLearner_1.createDefaultActorCriticConfig; } });
+ const PPOLearner_1 = require("./PPOLearner");
+ Object.defineProperty(exports, "PPOLearner", { enumerable: true, get: function () { return PPOLearner_1.PPOLearner; } });
+ Object.defineProperty(exports, "createDefaultPPOConfig", { enumerable: true, get: function () { return PPOLearner_1.createDefaultPPOConfig; } });
+ /**
+ * Factory function to create RL algorithm instances
+ */
+ function createRLAlgorithm(type, config) {
+ switch (type) {
+ case 'q-learning':
+ return new QLearning_1.QLearning(config);
+ case 'sarsa':
+ return new SARSALearner_1.SARSALearner(config);
+ case 'actor-critic':
+ return new ActorCriticLearner_1.ActorCriticLearner(config ?? (0, ActorCriticLearner_1.createDefaultActorCriticConfig)());
+ case 'ppo':
+ return new PPOLearner_1.PPOLearner(config ?? (0, PPOLearner_1.createDefaultPPOConfig)());
+ default:
+ throw new Error(`Unknown RL algorithm type: ${type}`);
+ }
+ }
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/learning/algorithms/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;AAsBH,8CAgBC;AApCD,2DAA0E;AAMjE,kGANA,qCAAiB,OAMA;AAL1B,4CAA0D;AAMjD,0FANA,qBAAS,OAMA;AALlB,iDAA2D;AAMlD,6FANA,2BAAY,OAMA;AALrB,6DAA6G;AAMpG,mGANA,uCAAkB,OAMA;AAAqB,+GANA,mDAA8B,OAMA;AAL9E,6CAA6E;AAMpE,2FANA,uBAAU,OAMA;AAAa,uGANA,mCAAsB,OAMA;AAOtD;;GAEG;AACH,SAAgB,iBAAiB,CAC/B,IAAqB,EACrB,MAAY;IAEZ,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,YAAY;YACf,OAAO,IAAI,qBAAS,CAAC,MAAM,CAAC,CAAC;QAC/B,KAAK,OAAO;YACV,OAAO,IAAI,2BAAY,CAAC,MAAM,CAAC,CAAC;QAClC,KAAK,cAAc;YACjB,OAAO,IAAI,uCAAkB,CAAC,MAAM,IAAI,IAAA,mDAA8B,GAAE,CAAC,CAAC;QAC5E,KAAK,KAAK;YACR,OAAO,IAAI,uBAAU,CAAC,MAAM,IAAI,IAAA,mCAAsB,GAAE,CAAC,CAAC;QAC5D;YACE,MAAM,IAAI,KAAK,CAAC,8BAA8B,IAAI,EAAE,CAAC,CAAC;IAC1D,CAAC;AACH,CAAC"}
@@ -1,6 +1,7 @@
  /**
  * Learning System - Phase 2 (Milestone 2.2)
  * Enhanced (v1.3.3+) - ML Root Cause Analysis and Fix Recommendations
+ * Enhanced (v2.2.0+) - Self-Learning Upgrade with RL Algorithms
  *
  * Exports all learning components for agent performance improvement.
  */
@@ -17,4 +18,6 @@ export * from './FlakyFixRecommendations';
  export * from './StatisticalAnalysis';
  export * from './SwarmIntegration';
  export { FixRecommendationEngine } from './FixRecommendationEngine';
+ export { AbstractRLLearner, RLConfig, QValue, SARSALearner, SARSAConfig, ActorCriticLearner, ActorCriticConfig, createDefaultActorCriticConfig, PPOLearner, PPOConfig, createDefaultPPOConfig, createRLAlgorithm } from './algorithms';
+ export { ExperienceSharingProtocol, ExperienceSharingConfig, SharedExperience, SharingStats, PeerConnection, SharingEvent } from './ExperienceSharingProtocol';
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/learning/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,cAAc,SAAS,CAAC;AACxB,cAAc,kBAAkB,CAAC;AACjC,cAAc,aAAa,CAAC;AAC5B,cAAc,0BAA0B,CAAC;AACzC,cAAc,sBAAsB,CAAC;AACrC,cAAc,mBAAmB,CAAC;AAClC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,uBAAuB,CAAC;AACtC,cAAc,oBAAoB,CAAC;AAGnC,OAAO,EACL,uBAAuB,EACxB,MAAM,2BAA2B,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/learning/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,cAAc,SAAS,CAAC;AACxB,cAAc,kBAAkB,CAAC;AACjC,cAAc,aAAa,CAAC;AAC5B,cAAc,0BAA0B,CAAC;AACzC,cAAc,sBAAsB,CAAC;AACrC,cAAc,mBAAmB,CAAC;AAClC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,wBAAwB,CAAC;AACvC,cAAc,2BAA2B,CAAC;AAC1C,cAAc,uBAAuB,CAAC;AACtC,cAAc,oBAAoB,CAAC;AAGnC,OAAO,EACL,uBAAuB,EACxB,MAAM,2BAA2B,CAAC;AAInC,OAAO,EACL,iBAAiB,EACjB,QAAQ,EACR,MAAM,EACN,YAAY,EACZ,WAAW,EACX,kBAAkB,EAClB,iBAAiB,EACjB,8BAA8B,EAC9B,UAAU,EACV,SAAS,EACT,sBAAsB,EACtB,iBAAiB,EAClB,MAAM,cAAc,CAAC;AAGtB,OAAO,EACL,yBAAyB,EACzB,uBAAuB,EACvB,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,YAAY,EACb,MAAM,6BAA6B,CAAC"}
@@ -2,6 +2,7 @@
  /**
  * Learning System - Phase 2 (Milestone 2.2)
  * Enhanced (v1.3.3+) - ML Root Cause Analysis and Fix Recommendations
+ * Enhanced (v2.2.0+) - Self-Learning Upgrade with RL Algorithms
  *
  * Exports all learning components for agent performance improvement.
  */
@@ -20,7 +21,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.FixRecommendationEngine = void 0;
+ exports.ExperienceSharingProtocol = exports.createRLAlgorithm = exports.createDefaultPPOConfig = exports.PPOLearner = exports.createDefaultActorCriticConfig = exports.ActorCriticLearner = exports.SARSALearner = exports.AbstractRLLearner = exports.FixRecommendationEngine = void 0;
  __exportStar(require("./types"), exports);
  __exportStar(require("./LearningEngine"), exports);
  __exportStar(require("./QLearning"), exports);
@@ -36,4 +37,17 @@ __exportStar(require("./SwarmIntegration"), exports);
  // Enhanced fix recommendations (NEW in v1.3.3+)
  var FixRecommendationEngine_1 = require("./FixRecommendationEngine");
  Object.defineProperty(exports, "FixRecommendationEngine", { enumerable: true, get: function () { return FixRecommendationEngine_1.FixRecommendationEngine; } });
+ // RL Algorithms (NEW in v2.2.0+)
+ // Note: RLAlgorithmType is already exported from LearningEngine, so we use explicit exports
+ var algorithms_1 = require("./algorithms");
+ Object.defineProperty(exports, "AbstractRLLearner", { enumerable: true, get: function () { return algorithms_1.AbstractRLLearner; } });
+ Object.defineProperty(exports, "SARSALearner", { enumerable: true, get: function () { return algorithms_1.SARSALearner; } });
+ Object.defineProperty(exports, "ActorCriticLearner", { enumerable: true, get: function () { return algorithms_1.ActorCriticLearner; } });
+ Object.defineProperty(exports, "createDefaultActorCriticConfig", { enumerable: true, get: function () { return algorithms_1.createDefaultActorCriticConfig; } });
+ Object.defineProperty(exports, "PPOLearner", { enumerable: true, get: function () { return algorithms_1.PPOLearner; } });
+ Object.defineProperty(exports, "createDefaultPPOConfig", { enumerable: true, get: function () { return algorithms_1.createDefaultPPOConfig; } });
+ Object.defineProperty(exports, "createRLAlgorithm", { enumerable: true, get: function () { return algorithms_1.createRLAlgorithm; } });
+ // Experience Sharing (NEW in v2.2.0+)
+ var ExperienceSharingProtocol_1 = require("./ExperienceSharingProtocol");
+ Object.defineProperty(exports, "ExperienceSharingProtocol", { enumerable: true, get: function () { return ExperienceSharingProtocol_1.ExperienceSharingProtocol; } });
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/learning/index.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;;;;;;;;;;;;;AAEH,0CAAwB;AACxB,mDAAiC;AACjC,8CAA4B;AAC5B,2DAAyC;AACzC,uDAAqC;AACrC,oDAAkC;AAClC,sDAAoC;AACpC,sDAAoC;AACpC,yDAAuC;AACvC,4DAA0C;AAC1C,wDAAsC;AACtC,qDAAmC;AAEnC,gDAAgD;AAChD,qEAEmC;AADjC,kIAAA,uBAAuB,OAAA"}
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/learning/index.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG;;;;;;;;;;;;;;;;;AAEH,0CAAwB;AACxB,mDAAiC;AACjC,8CAA4B;AAC5B,2DAAyC;AACzC,uDAAqC;AACrC,oDAAkC;AAClC,sDAAoC;AACpC,sDAAoC;AACpC,yDAAuC;AACvC,4DAA0C;AAC1C,wDAAsC;AACtC,qDAAmC;AAEnC,gDAAgD;AAChD,qEAEmC;AADjC,kIAAA,uBAAuB,OAAA;AAGzB,iCAAiC;AACjC,4FAA4F;AAC5F,2CAasB;AAZpB,+GAAA,iBAAiB,OAAA;AAGjB,0GAAA,YAAY,OAAA;AAEZ,gHAAA,kBAAkB,OAAA;AAElB,4HAAA,8BAA8B,OAAA;AAC9B,wGAAA,UAAU,OAAA;AAEV,oHAAA,sBAAsB,OAAA;AACtB,+GAAA,iBAAiB,OAAA;AAGnB,sCAAsC;AACtC,yEAOqC;AANnC,sIAAA,yBAAyB,OAAA"}
@@ -31,6 +31,8 @@ export interface TaskExperience {
  nextState: TaskState;
  timestamp: Date;
  agentId: string;
+ /** Whether this experience represents a terminal state (episode end) */
+ done?: boolean;
  }
  /**
  * State representation for reinforcement learning
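The new optional done flag marks an episode-terminal transition on TaskExperience. The hunks shown here only add the field and do not show a consumer; by convention, a terminal transition drops the bootstrap term from the TD target, which would look like the following sketch (an assumption for illustration, not code from the package):

// Hedged sketch of how a terminal flag is conventionally applied to a TD target.
function tdTarget(reward: number, maxNextQ: number, discountFactor: number, done?: boolean): number {
  // At episode end there is no future return to bootstrap from.
  return done ? reward : reward + discountFactor * maxNextQ;
}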