agentic-qe 2.1.2 → 2.2.0
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.
- package/CHANGELOG.md +123 -0
- package/README.md +1 -1
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/index.js +5 -1
- package/dist/agents/index.js.map +1 -1
- package/dist/core/di/AgentDependencies.d.ts +127 -0
- package/dist/core/di/AgentDependencies.d.ts.map +1 -0
- package/dist/core/di/AgentDependencies.js +251 -0
- package/dist/core/di/AgentDependencies.js.map +1 -0
- package/dist/core/di/DIContainer.d.ts +149 -0
- package/dist/core/di/DIContainer.d.ts.map +1 -0
- package/dist/core/di/DIContainer.js +333 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.d.ts +11 -0
- package/dist/core/di/index.d.ts.map +1 -0
- package/dist/core/di/index.js +22 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/index.d.ts +1 -0
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +11 -1
- package/dist/core/index.js.map +1 -1
- package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
- package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
- package/dist/learning/ExperienceSharingProtocol.js +538 -0
- package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
- package/dist/learning/LearningEngine.d.ts +101 -1
- package/dist/learning/LearningEngine.d.ts.map +1 -1
- package/dist/learning/LearningEngine.js +330 -3
- package/dist/learning/LearningEngine.js.map +1 -1
- package/dist/learning/QLearning.d.ts +38 -125
- package/dist/learning/QLearning.d.ts.map +1 -1
- package/dist/learning/QLearning.js +46 -267
- package/dist/learning/QLearning.js.map +1 -1
- package/dist/learning/QLearningLegacy.d.ts +154 -0
- package/dist/learning/QLearningLegacy.d.ts.map +1 -0
- package/dist/learning/QLearningLegacy.js +337 -0
- package/dist/learning/QLearningLegacy.js.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
- package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
- package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
- package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
- package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/PPOLearner.js +490 -0
- package/dist/learning/algorithms/PPOLearner.js.map +1 -0
- package/dist/learning/algorithms/QLearning.d.ts +68 -0
- package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
- package/dist/learning/algorithms/QLearning.js +116 -0
- package/dist/learning/algorithms/QLearning.js.map +1 -0
- package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
- package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
- package/dist/learning/algorithms/SARSALearner.js +252 -0
- package/dist/learning/algorithms/SARSALearner.js.map +1 -0
- package/dist/learning/algorithms/index.d.ts +29 -0
- package/dist/learning/algorithms/index.d.ts.map +1 -0
- package/dist/learning/algorithms/index.js +44 -0
- package/dist/learning/algorithms/index.js.map +1 -0
- package/dist/learning/index.d.ts +3 -0
- package/dist/learning/index.d.ts.map +1 -1
- package/dist/learning/index.js +15 -1
- package/dist/learning/index.js.map +1 -1
- package/dist/learning/types.d.ts +2 -0
- package/dist/learning/types.d.ts.map +1 -1
- package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
- package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
- package/dist/memory/DistributedPatternLibrary.js +370 -0
- package/dist/memory/DistributedPatternLibrary.js.map +1 -0
- package/dist/memory/PatternQualityScorer.d.ts +169 -0
- package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
- package/dist/memory/PatternQualityScorer.js +327 -0
- package/dist/memory/PatternQualityScorer.js.map +1 -0
- package/dist/memory/PatternReplicationService.d.ts +187 -0
- package/dist/memory/PatternReplicationService.d.ts.map +1 -0
- package/dist/memory/PatternReplicationService.js +392 -0
- package/dist/memory/PatternReplicationService.js.map +1 -0
- package/dist/providers/ClaudeProvider.d.ts +98 -0
- package/dist/providers/ClaudeProvider.d.ts.map +1 -0
- package/dist/providers/ClaudeProvider.js +418 -0
- package/dist/providers/ClaudeProvider.js.map +1 -0
- package/dist/providers/ILLMProvider.d.ts +287 -0
- package/dist/providers/ILLMProvider.d.ts.map +1 -0
- package/dist/providers/ILLMProvider.js +33 -0
- package/dist/providers/ILLMProvider.js.map +1 -0
- package/dist/providers/LLMProviderFactory.d.ts +154 -0
- package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
- package/dist/providers/LLMProviderFactory.js +426 -0
- package/dist/providers/LLMProviderFactory.js.map +1 -0
- package/dist/providers/RuvllmProvider.d.ts +107 -0
- package/dist/providers/RuvllmProvider.d.ts.map +1 -0
- package/dist/providers/RuvllmProvider.js +417 -0
- package/dist/providers/RuvllmProvider.js.map +1 -0
- package/dist/providers/index.d.ts +31 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +69 -0
- package/dist/providers/index.js.map +1 -0
- package/package.json +1 -1

package/dist/learning/QLearning.d.ts

@@ -1,154 +1,67 @@
 /**
- * QLearning -
+ * QLearning - Off-policy TD(0) Reinforcement Learning
  *
  * Implements standard Q-learning algorithm for reinforcement learning.
- *
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
  */
-import {
+import { AbstractRLLearner, RLConfig } from './algorithms/AbstractRLLearner';
+import { TaskExperience, AgentAction } from './types';
 /**
- * Q-learning
+ * Q-learning configuration (same as base RL config)
  */
-export
-    learningRate: number;
-    discountFactor: number;
-    explorationRate: number;
-    explorationDecay: number;
-    minExplorationRate: number;
-    useExperienceReplay: boolean;
-    replayBufferSize: number;
-    batchSize: number;
-}
-/**
- * Q-learning action-value pair
- */
-interface QValue {
-    state: string;
-    action: string;
-    value: number;
-    updateCount: number;
-    lastUpdated: number;
-}
+export type QLearningConfig = RLConfig;
 /**
  * QLearning - Standard Q-learning implementation
  *
  * Implements the classic Q-learning algorithm with:
  * - Epsilon-greedy exploration policy
- * -
+ * - Off-policy temporal difference (TD) learning
  * - Q-table for state-action values
  * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
  */
-export declare class QLearning {
-    private readonly
-
-    private qTable;
-    private replayBuffer?;
-    private stepCount;
-    private episodeCount;
-    constructor(config?: Partial<QLearningConfig>);
-    /**
-     * Select action using epsilon-greedy policy
-     * With probability ε, select random action (exploration)
-     * Otherwise, select action with highest Q-value (exploitation)
-     */
-    selectAction(state: TaskState, availableActions: AgentAction[]): AgentAction;
-    /**
-     * Get best action based on current Q-values
-     */
-    getBestAction(state: TaskState, availableActions: AgentAction[]): AgentAction;
+export declare class QLearning extends AbstractRLLearner {
+    private readonly defaultConfig;
+    constructor(config?: Partial<RLConfig>);
     /**
      * Update Q-value using Q-learning update rule
      * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction Ignored in Q-learning (uses max Q-value instead)
      */
-    update(experience: TaskExperience): void;
+    update(experience: TaskExperience, nextAction?: AgentAction): void;
     /**
-     *
-     * Samples random batch from replay buffer and updates Q-values
+     * Get the default exploration rate for this algorithm
      */
-
+    protected getDefaultExplorationRate(): number;
     /**
-     * Get
+     * Get algorithm name
      */
-
+    getAlgorithmName(): string;
     /**
-     * Get
+     * Get algorithm type (off-policy)
      */
-
+    getAlgorithmType(): 'on-policy' | 'off-policy';
     /**
-     * Get
-     * V(s) = max_a Q(s,a)
+     * Get detailed statistics including Q-learning-specific metrics
      */
-
-
-
-
-     */
-    decayExploration(): void;
-    /**
-     * Mark end of episode
-     */
-    endEpisode(): void;
-    /**
-     * Encode state to string key for Q-table
-     */
-    private encodeState;
-    /**
-     * Encode action to string key for Q-table
-     */
-    private encodeAction;
-    /**
-     * Get current exploration rate (epsilon)
-     */
-    getExplorationRate(): number;
-    /**
-     * Get total number of learning steps
-     */
-    getStepCount(): number;
-    /**
-     * Get total number of episodes
-     */
-    getEpisodeCount(): number;
-    /**
-     * Get Q-table size (number of state-action pairs)
-     */
-    getTableSize(): number;
-    /**
-     * Get statistics about learning progress
-     */
-    getStatistics(): {
-        steps: number;
-        episodes: number;
-        tableSize: number;
-        explorationRate: number;
-        avgQValue: number;
-        maxQValue: number;
-        minQValue: number;
+    getDetailedStatistics(): {
+        algorithm: string;
+        type: 'on-policy' | 'off-policy';
+        stats: ReturnType<AbstractRLLearner['getStatistics']>;
     };
-    /**
-     * Reset Q-table and learning state
-     */
-    reset(): void;
-    /**
-     * Export Q-table and state for persistence
-     */
-    export(): {
-        qTable: Record<string, Record<string, QValue>>;
-        config: QLearningConfig;
-        stepCount: number;
-        episodeCount: number;
-    };
-    /**
-     * Import Q-table and state from persistence
-     */
-    import(state: {
-        qTable: Record<string, Record<string, QValue>>;
-        config: QLearningConfig;
-        stepCount: number;
-        episodeCount: number;
-    }): void;
-    /**
-     * Get memory usage estimate in bytes
-     */
-    getMemoryUsage(): number;
 }
-export {};
 //# sourceMappingURL=QLearning.d.ts.map

package/dist/learning/QLearning.d.ts.map

@@ -1 +1 @@
-{"version":3,"file":"QLearning.d.ts","sourceRoot":"","sources":["../../src/learning/QLearning.ts"],"names":[],"mappings":"AAAA
+{"version":3,"file":"QLearning.d.ts","sourceRoot":"","sources":["../../src/learning/QLearning.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,gCAAgC,CAAC;AAC7E,OAAO,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAEtD;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,QAAQ,CAAC;AAgBvC;;;;;;;;;;;;;;;;;GAiBG;AACH,qBAAa,SAAU,SAAQ,iBAAiB;IAC9C,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAW;gBAE7B,MAAM,GAAE,OAAO,CAAC,QAAQ,CAAM;IAO1C;;;;;;OAMG;IACH,MAAM,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,EAAE,WAAW,GAAG,IAAI;IAiClE;;OAEG;IACH,SAAS,CAAC,yBAAyB,IAAI,MAAM;IAI7C;;OAEG;IACH,gBAAgB,IAAI,MAAM;IAI1B;;OAEG;IACH,gBAAgB,IAAI,WAAW,GAAG,YAAY;IAI9C;;OAEG;IACH,qBAAqB,IAAI;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,IAAI,EAAE,WAAW,GAAG,YAAY,CAAC;QACjC,KAAK,EAAE,UAAU,CAAC,iBAAiB,CAAC,eAAe,CAAC,CAAC,CAAC;KACvD;CAOF"}
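
The declaration diff above is the entire 2.2.0 public surface of `QLearning`: it now extends `AbstractRLLearner`, accepts a `Partial<RLConfig>`, and exposes algorithm metadata helpers in place of the removed Q-table accessors. A minimal consumer sketch under stated assumptions: the deep-import specifiers are illustrative, only members shown in the `.d.ts` above are called, and `RLConfig` fields other than `learningRate` and `explorationRate` (plus the full shape of `TaskExperience`) live in files this diff does not include.

```ts
// Sketch only: exercising the 2.2.0 QLearning surface declared above.
// Import paths are assumptions; the package may re-export these from an index.
import { QLearning } from 'agentic-qe/dist/learning/QLearning';
import type { TaskExperience } from 'agentic-qe/dist/learning/types';

const learner = new QLearning({ learningRate: 0.1, explorationRate: 0.2 });

// Q-learning is off-policy: the optional nextAction parameter exists so the
// signature matches the shared AbstractRLLearner contract, but it is ignored,
// because the target bootstraps from max_a' Q(s', a') rather than the action taken.
function recordOutcome(experience: TaskExperience): void {
  learner.update(experience); // nextAction intentionally omitted
}

console.log(learner.getAlgorithmName());      // 'Q-Learning'
console.log(learner.getAlgorithmType());      // 'off-policy'
console.log(learner.getDetailedStatistics()); // { algorithm, type, stats }
```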

package/dist/learning/QLearning.js

@@ -1,14 +1,17 @@
 "use strict";
 /**
- * QLearning -
+ * QLearning - Off-policy TD(0) Reinforcement Learning
  *
  * Implements standard Q-learning algorithm for reinforcement learning.
- *
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
  */
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.QLearning = void 0;
-const
-const ExperienceReplayBuffer_1 = require("./ExperienceReplayBuffer");
+const AbstractRLLearner_1 = require("./algorithms/AbstractRLLearner");
 /**
  * Default Q-learning configuration
  */
@@ -27,85 +30,42 @@ const DEFAULT_CONFIG = {
  *
  * Implements the classic Q-learning algorithm with:
  * - Epsilon-greedy exploration policy
- * -
+ * - Off-policy temporal difference (TD) learning
  * - Q-table for state-action values
  * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
  */
-class QLearning {
+class QLearning extends AbstractRLLearner_1.AbstractRLLearner {
     constructor(config = {}) {
-
-
-        this.
-        this.
-        this.episodeCount = 0;
-        // Initialize experience replay buffer if enabled
-        if (this.config.useExperienceReplay) {
-            this.replayBuffer = new ExperienceReplayBuffer_1.ExperienceReplayBuffer({
-                maxSize: this.config.replayBufferSize,
-                minSize: this.config.batchSize,
-                prioritized: false
-            });
-        }
-        this.logger.info('QLearning initialized', { config: this.config });
-    }
-    /**
-     * Select action using epsilon-greedy policy
-     * With probability ε, select random action (exploration)
-     * Otherwise, select action with highest Q-value (exploitation)
-     */
-    selectAction(state, availableActions) {
-        if (availableActions.length === 0) {
-            throw new Error('No available actions to select from');
-        }
-        // Exploration: random action
-        if (Math.random() < this.config.explorationRate) {
-            const randomIndex = Math.floor(Math.random() * availableActions.length);
-            return availableActions[randomIndex];
-        }
-        // Exploitation: best action based on Q-values
-        return this.getBestAction(state, availableActions);
-    }
-    /**
-     * Get best action based on current Q-values
-     */
-    getBestAction(state, availableActions) {
-        const stateKey = this.encodeState(state);
-        const stateActions = this.qTable.get(stateKey);
-        if (!stateActions || stateActions.size === 0) {
-            // No Q-values yet, return random action
-            const randomIndex = Math.floor(Math.random() * availableActions.length);
-            return availableActions[randomIndex];
-        }
-        // Find action with highest Q-value
-        let bestAction = availableActions[0];
-        let bestValue = -Infinity;
-        for (const action of availableActions) {
-            const actionKey = this.encodeAction(action);
-            const qValue = stateActions.get(actionKey);
-            if (qValue && qValue.value > bestValue) {
-                bestValue = qValue.value;
-                bestAction = action;
-            }
-        }
-        return bestAction;
+        const fullConfig = { ...DEFAULT_CONFIG, ...config };
+        super(fullConfig);
+        this.defaultConfig = fullConfig;
+        this.logger.info('QLearning initialized with off-policy TD(0)', { config: fullConfig });
     }
     /**
      * Update Q-value using Q-learning update rule
      * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction Ignored in Q-learning (uses max Q-value instead)
      */
-    update(experience) {
+    update(experience, nextAction) {
         const stateKey = this.encodeState(experience.state);
         const actionKey = this.encodeAction(experience.action);
         const nextStateKey = this.encodeState(experience.nextState);
-        // Get
-        if (!this.qTable.has(stateKey)) {
-            this.qTable.set(stateKey, new Map());
-        }
+        // Get current Q-value Q(s,a)
         const stateActions = this.qTable.get(stateKey);
-
-
-
-        // Get max Q-value for next state (for all possible actions)
+        const currentQ = stateActions?.get(actionKey)?.value ?? 0;
+        // Q-Learning: Get max Q-value for next state (greedy)
+        // This is the key difference from SARSA (which uses actual next action)
         const nextStateActions = this.qTable.get(nextStateKey);
         const maxNextQ = nextStateActions && nextStateActions.size > 0
             ? Math.max(...Array.from(nextStateActions.values()).map(qv => qv.value))
@@ -116,222 +76,41 @@ class QLearning {
         const tdError = tdTarget - currentQ;
         const newQ = currentQ + this.config.learningRate * tdError;
         // Update Q-value
-
-
-            action: actionKey,
-            value: newQ,
-            updateCount: (currentQValue?.updateCount ?? 0) + 1,
-            lastUpdated: Date.now()
-        });
-        // Add to experience replay buffer
+        this.setQValue(stateKey, actionKey, newQ);
+        // Add to experience replay buffer if enabled
         if (this.replayBuffer) {
            this.replayBuffer.add(experience, Math.abs(tdError)); // Priority based on TD error
         }
         this.stepCount++;
     }
     /**
-     *
-     * Samples random batch from replay buffer and updates Q-values
-     */
-    batchUpdate() {
-        if (!this.replayBuffer || !this.replayBuffer.canSample(this.config.batchSize)) {
-            return;
-        }
-        const batch = this.replayBuffer.sample(this.config.batchSize);
-        for (const experience of batch) {
-            this.update(experience);
-        }
-        this.logger.debug(`Performed batch update with ${batch.length} experiences`);
-    }
-    /**
-     * Get Q-value for a state-action pair
-     */
-    getQValue(state, action) {
-        const stateKey = this.encodeState(state);
-        const actionKey = this.encodeAction(action);
-        const stateActions = this.qTable.get(stateKey);
-        if (!stateActions) {
-            return 0;
-        }
-        const qValue = stateActions.get(actionKey);
-        return qValue?.value ?? 0;
-    }
-    /**
-     * Get all Q-values for a state
-     */
-    getStateValues(state) {
-        const stateKey = this.encodeState(state);
-        const stateActions = this.qTable.get(stateKey);
-        if (!stateActions) {
-            return new Map();
-        }
-        const values = new Map();
-        for (const [actionKey, qValue] of stateActions.entries()) {
-            values.set(actionKey, qValue.value);
-        }
-        return values;
-    }
-    /**
-     * Get value of a state (max Q-value over all actions)
-     * V(s) = max_a Q(s,a)
-     */
-    getStateValue(state) {
-        const stateKey = this.encodeState(state);
-        const stateActions = this.qTable.get(stateKey);
-        if (!stateActions || stateActions.size === 0) {
-            return 0;
-        }
-        return Math.max(...Array.from(stateActions.values()).map(qv => qv.value));
-    }
-    /**
-     * Decay exploration rate (epsilon)
-     * Called after each episode to gradually reduce exploration
+     * Get the default exploration rate for this algorithm
      */
-
-
+    getDefaultExplorationRate() {
+        return this.defaultConfig.explorationRate;
     }
     /**
-     *
+     * Get algorithm name
      */
-
-
-        this.decayExploration();
-        // Perform batch update if using experience replay
-        if (this.config.useExperienceReplay) {
-            this.batchUpdate();
-        }
-    }
-    /**
-     * Encode state to string key for Q-table
-     */
-    encodeState(state) {
-        // Create normalized feature vector
-        const features = [
-            state.taskComplexity,
-            state.requiredCapabilities.length / 10, // normalize
-            state.previousAttempts / 5, // normalize
-            state.availableResources,
-            state.timeConstraint ? Math.min(state.timeConstraint / 300000, 1) : 1 // normalize to 5 min
-        ];
-        // Round to reduce state space (discretization)
-        return features.map(f => Math.round(f * 10) / 10).join(',');
-    }
-    /**
-     * Encode action to string key for Q-table
-     */
-    encodeAction(action) {
-        return `${action.strategy}:${action.parallelization.toFixed(1)}:${action.retryPolicy}`;
-    }
-    /**
-     * Get current exploration rate (epsilon)
-     */
-    getExplorationRate() {
-        return this.config.explorationRate;
-    }
-    /**
-     * Get total number of learning steps
-     */
-    getStepCount() {
-        return this.stepCount;
-    }
-    /**
-     * Get total number of episodes
-     */
-    getEpisodeCount() {
-        return this.episodeCount;
-    }
-    /**
-     * Get Q-table size (number of state-action pairs)
-     */
-    getTableSize() {
-        let size = 0;
-        for (const stateActions of this.qTable.values()) {
-            size += stateActions.size;
-        }
-        return size;
+    getAlgorithmName() {
+        return 'Q-Learning';
     }
     /**
-     * Get
+     * Get algorithm type (off-policy)
      */
-
-
-        let count = 0;
-        let maxQ = -Infinity;
-        let minQ = Infinity;
-        for (const stateActions of this.qTable.values()) {
-            for (const qValue of stateActions.values()) {
-                totalQValue += qValue.value;
-                maxQ = Math.max(maxQ, qValue.value);
-                minQ = Math.min(minQ, qValue.value);
-                count++;
-            }
-        }
-        return {
-            steps: this.stepCount,
-            episodes: this.episodeCount,
-            tableSize: count,
-            explorationRate: this.config.explorationRate,
-            avgQValue: count > 0 ? totalQValue / count : 0,
-            maxQValue: count > 0 ? maxQ : 0,
-            minQValue: count > 0 ? minQ : 0
-        };
-    }
-    /**
-     * Reset Q-table and learning state
-     */
-    reset() {
-        this.qTable.clear();
-        this.stepCount = 0;
-        this.episodeCount = 0;
-        this.config.explorationRate = DEFAULT_CONFIG.explorationRate;
-        if (this.replayBuffer) {
-            this.replayBuffer.clear();
-        }
-        this.logger.info('QLearning reset to initial state');
+    getAlgorithmType() {
+        return 'off-policy';
     }
     /**
-     *
+     * Get detailed statistics including Q-learning-specific metrics
      */
-
-        const serializedQTable = {};
-        for (const [state, actions] of this.qTable.entries()) {
-            serializedQTable[state] = {};
-            for (const [action, qValue] of actions.entries()) {
-                serializedQTable[state][action] = qValue;
-            }
-        }
+    getDetailedStatistics() {
         return {
-
-
-
-            episodeCount: this.episodeCount
+            algorithm: this.getAlgorithmName(),
+            type: this.getAlgorithmType(),
+            stats: this.getStatistics()
         };
     }
-    /**
-     * Import Q-table and state from persistence
-     */
-    import(state) {
-        this.qTable.clear();
-        for (const [stateKey, actions] of Object.entries(state.qTable)) {
-            const actionMap = new Map();
-            for (const [actionKey, qValue] of Object.entries(actions)) {
-                actionMap.set(actionKey, qValue);
-            }
-            this.qTable.set(stateKey, actionMap);
-        }
-        this.config = { ...state.config };
-        this.stepCount = state.stepCount;
-        this.episodeCount = state.episodeCount;
-        this.logger.info(`Imported Q-table with ${this.getTableSize()} state-action pairs`);
-    }
-    /**
-     * Get memory usage estimate in bytes
-     */
-    getMemoryUsage() {
-        const qTableSize = JSON.stringify(this.export().qTable).length;
-        const bufferSize = this.replayBuffer?.getMemoryUsage() ?? 0;
-        return qTableSize + bufferSize;
-    }
 }
 exports.QLearning = QLearning;
 //# sourceMappingURL=QLearning.js.map

package/dist/learning/QLearning.js.map

@@ -1 +1 @@
-{"version":3,"file":"QLearning.js","sourceRoot":"","sources":["../../src/learning/QLearning.ts"],"names":[],"mappings":";AAAA
+{"version":3,"file":"QLearning.js","sourceRoot":"","sources":["../../src/learning/QLearning.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;AAEH,sEAA6E;AAQ7E;;GAEG;AACH,MAAM,cAAc,GAAa;IAC/B,YAAY,EAAE,GAAG;IACjB,cAAc,EAAE,IAAI;IACpB,eAAe,EAAE,GAAG;IACpB,gBAAgB,EAAE,KAAK;IACvB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,IAAI;IACzB,gBAAgB,EAAE,KAAK;IACvB,SAAS,EAAE,EAAE;CACd,CAAC;AAEF;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAa,SAAU,SAAQ,qCAAiB;IAG9C,YAAY,SAA4B,EAAE;QACxC,MAAM,UAAU,GAAG,EAAE,GAAG,cAAc,EAAE,GAAG,MAAM,EAAE,CAAC;QACpD,KAAK,CAAC,UAAU,CAAC,CAAC;QAClB,IAAI,CAAC,aAAa,GAAG,UAAU,CAAC;QAChC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,6CAA6C,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,UAA0B,EAAE,UAAwB;QACzD,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QACpD,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QACvD,MAAM,YAAY,GAAG,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC;QAE5D,6BAA6B;QAC7B,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC/C,MAAM,QAAQ,GAAG,YAAY,EAAE,GAAG,CAAC,SAAS,CAAC,EAAE,KAAK,IAAI,CAAC,CAAC;QAE1D,sDAAsD;QACtD,wEAAwE;QACxE,MAAM,gBAAgB,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,gBAAgB,IAAI,gBAAgB,CAAC,IAAI,GAAG,CAAC;YAC5D,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC;YACxE,CAAC,CAAC,CAAC,CAAC;QAEN,yBAAyB;QACzB,yDAAyD;QACzD,MAAM,QAAQ,GAAG,UAAU,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,cAAc,GAAG,QAAQ,CAAC;QAC3E,MAAM,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;QACpC,MAAM,IAAI,GAAG,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,YAAY,GAAG,OAAO,CAAC;QAE3D,iBAAiB;QACjB,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;QAE1C,6CAA6C;QAC7C,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,6BAA6B;QACrF,CAAC;QAED,IAAI,CAAC,SAAS,EAAE,CAAC;IACnB,CAAC;IAED;;OAEG;IACO,yBAAyB;QACjC,OAAO,IAAI,CAAC,aAAa,CAAC,eAAe,CAAC;IAC5C,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,qBAAqB;QAKnB,OAAO;YACL,SAAS,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAClC,IAAI,EAAE,IAAI,CAAC,gBAAgB,EAAE;YAC7B,KAAK,EAAE,IAAI,CAAC,aAAa,EAAE;SAC5B,CAAC;IACJ,CAAC;CACF;AArFD,8BAqFC"}
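
The compiled `update()` shown above is the off-policy TD(0) step the doc comments describe. Below is a self-contained numeric sketch of that rule only; the two-state Q-table, action names, reward, and hyperparameter values are invented for illustration, and none of the package's key encoding, replay buffer, or `setQValue` plumbing is reproduced.

```ts
// Standalone illustration of the documented update rule:
//   Q(s,a) ← Q(s,a) + α · [r + γ · max_a' Q(s',a') − Q(s,a)]
const alpha = 0.1;  // learning rate (assumed value)
const gamma = 0.95; // discount factor (assumed value)

const qTable = new Map<string, Map<string, number>>([
  ['s1', new Map([['run-parallel', 0.4], ['run-serial', 0.1]])],
  ['s2', new Map([['run-parallel', 0.8], ['run-serial', 0.3]])],
]);

// Transition: took 'run-serial' in s1, received reward 1, landed in s2.
const currentQ = qTable.get('s1')!.get('run-serial')!;    // 0.1
const maxNextQ = Math.max(...qTable.get('s2')!.values()); // 0.8 — greedy bootstrap (off-policy)
const tdTarget = 1 + gamma * maxNextQ;                    // 1.76
const tdError = tdTarget - currentQ;                      // 1.66
const newQ = currentQ + alpha * tdError;                  // 0.266

// SARSA would instead bootstrap from Q(s2, actualNextAction) here.
qTable.get('s1')!.set('run-serial', newQ);
console.log(newQ.toFixed(3)); // "0.266"
```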