agentic-qe 2.1.2 → 2.2.1
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in their public registries.
- package/.claude/skills/agentic-quality-engineering/SKILL.md +4 -4
- package/.claude/skills/cicd-pipeline-qe-orchestrator/README.md +14 -11
- package/.claude/skills/skills-manifest.json +2 -2
- package/CHANGELOG.md +138 -0
- package/README.md +92 -214
- package/dist/agents/BaseAgent.d.ts +5 -1
- package/dist/agents/BaseAgent.d.ts.map +1 -1
- package/dist/agents/BaseAgent.js +32 -17
- package/dist/agents/BaseAgent.js.map +1 -1
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/index.js +5 -1
- package/dist/agents/index.js.map +1 -1
- package/dist/cli/commands/improve/index.d.ts +8 -1
- package/dist/cli/commands/improve/index.d.ts.map +1 -1
- package/dist/cli/commands/improve/index.js +18 -16
- package/dist/cli/commands/improve/index.js.map +1 -1
- package/dist/cli/commands/learn/index.d.ts +10 -2
- package/dist/cli/commands/learn/index.d.ts.map +1 -1
- package/dist/cli/commands/learn/index.js +99 -63
- package/dist/cli/commands/learn/index.js.map +1 -1
- package/dist/cli/commands/patterns/index.d.ts +8 -1
- package/dist/cli/commands/patterns/index.d.ts.map +1 -1
- package/dist/cli/commands/patterns/index.js +79 -45
- package/dist/cli/commands/patterns/index.js.map +1 -1
- package/dist/cli/commands/routing/index.d.ts +5 -0
- package/dist/cli/commands/routing/index.d.ts.map +1 -1
- package/dist/cli/commands/routing/index.js +11 -10
- package/dist/cli/commands/routing/index.js.map +1 -1
- package/dist/cli/init/agents.d.ts +1 -1
- package/dist/cli/init/agents.js +2 -2
- package/dist/cli/init/database-init.d.ts +7 -0
- package/dist/cli/init/database-init.d.ts.map +1 -1
- package/dist/cli/init/database-init.js +29 -48
- package/dist/cli/init/database-init.js.map +1 -1
- package/dist/core/di/AgentDependencies.d.ts +127 -0
- package/dist/core/di/AgentDependencies.d.ts.map +1 -0
- package/dist/core/di/AgentDependencies.js +251 -0
- package/dist/core/di/AgentDependencies.js.map +1 -0
- package/dist/core/di/DIContainer.d.ts +149 -0
- package/dist/core/di/DIContainer.d.ts.map +1 -0
- package/dist/core/di/DIContainer.js +333 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.d.ts +11 -0
- package/dist/core/di/index.d.ts.map +1 -0
- package/dist/core/di/index.js +22 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/index.d.ts +1 -0
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +11 -1
- package/dist/core/index.js.map +1 -1
- package/dist/core/memory/HNSWVectorMemory.d.ts +261 -0
- package/dist/core/memory/HNSWVectorMemory.d.ts.map +1 -0
- package/dist/core/memory/HNSWVectorMemory.js +647 -0
- package/dist/core/memory/HNSWVectorMemory.js.map +1 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts +7 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts.map +1 -1
- package/dist/core/memory/SwarmMemoryManager.js +9 -0
- package/dist/core/memory/SwarmMemoryManager.js.map +1 -1
- package/dist/core/memory/index.d.ts +2 -0
- package/dist/core/memory/index.d.ts.map +1 -1
- package/dist/core/memory/index.js +11 -1
- package/dist/core/memory/index.js.map +1 -1
- package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
- package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
- package/dist/learning/ExperienceSharingProtocol.js +538 -0
- package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
- package/dist/learning/ExplainableLearning.d.ts +191 -0
- package/dist/learning/ExplainableLearning.d.ts.map +1 -0
- package/dist/learning/ExplainableLearning.js +441 -0
- package/dist/learning/ExplainableLearning.js.map +1 -0
- package/dist/learning/GossipPatternSharingProtocol.d.ts +228 -0
- package/dist/learning/GossipPatternSharingProtocol.d.ts.map +1 -0
- package/dist/learning/GossipPatternSharingProtocol.js +590 -0
- package/dist/learning/GossipPatternSharingProtocol.js.map +1 -0
- package/dist/learning/LearningEngine.d.ts +104 -4
- package/dist/learning/LearningEngine.d.ts.map +1 -1
- package/dist/learning/LearningEngine.js +350 -16
- package/dist/learning/LearningEngine.js.map +1 -1
- package/dist/learning/PerformanceOptimizer.d.ts +268 -0
- package/dist/learning/PerformanceOptimizer.d.ts.map +1 -0
- package/dist/learning/PerformanceOptimizer.js +552 -0
- package/dist/learning/PerformanceOptimizer.js.map +1 -0
- package/dist/learning/PrivacyManager.d.ts +197 -0
- package/dist/learning/PrivacyManager.d.ts.map +1 -0
- package/dist/learning/PrivacyManager.js +551 -0
- package/dist/learning/PrivacyManager.js.map +1 -0
- package/dist/learning/QLearning.d.ts +38 -125
- package/dist/learning/QLearning.d.ts.map +1 -1
- package/dist/learning/QLearning.js +46 -267
- package/dist/learning/QLearning.js.map +1 -1
- package/dist/learning/QLearningLegacy.d.ts +154 -0
- package/dist/learning/QLearningLegacy.d.ts.map +1 -0
- package/dist/learning/QLearningLegacy.js +337 -0
- package/dist/learning/QLearningLegacy.js.map +1 -0
- package/dist/learning/TransferLearningManager.d.ts +212 -0
- package/dist/learning/TransferLearningManager.d.ts.map +1 -0
- package/dist/learning/TransferLearningManager.js +497 -0
- package/dist/learning/TransferLearningManager.js.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
- package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
- package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
- package/dist/learning/algorithms/MAMLMetaLearner.d.ts +218 -0
- package/dist/learning/algorithms/MAMLMetaLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/MAMLMetaLearner.js +532 -0
- package/dist/learning/algorithms/MAMLMetaLearner.js.map +1 -0
- package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
- package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/PPOLearner.js +490 -0
- package/dist/learning/algorithms/PPOLearner.js.map +1 -0
- package/dist/learning/algorithms/QLearning.d.ts +68 -0
- package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
- package/dist/learning/algorithms/QLearning.js +116 -0
- package/dist/learning/algorithms/QLearning.js.map +1 -0
- package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
- package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
- package/dist/learning/algorithms/SARSALearner.js +252 -0
- package/dist/learning/algorithms/SARSALearner.js.map +1 -0
- package/dist/learning/algorithms/index.d.ts +32 -0
- package/dist/learning/algorithms/index.d.ts.map +1 -0
- package/dist/learning/algorithms/index.js +50 -0
- package/dist/learning/algorithms/index.js.map +1 -0
- package/dist/learning/index.d.ts +11 -0
- package/dist/learning/index.d.ts.map +1 -1
- package/dist/learning/index.js +31 -1
- package/dist/learning/index.js.map +1 -1
- package/dist/learning/types.d.ts +2 -0
- package/dist/learning/types.d.ts.map +1 -1
- package/dist/mcp/server-instructions.d.ts +1 -1
- package/dist/mcp/server-instructions.js +1 -1
- package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
- package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
- package/dist/memory/DistributedPatternLibrary.js +370 -0
- package/dist/memory/DistributedPatternLibrary.js.map +1 -0
- package/dist/memory/PatternQualityScorer.d.ts +169 -0
- package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
- package/dist/memory/PatternQualityScorer.js +327 -0
- package/dist/memory/PatternQualityScorer.js.map +1 -0
- package/dist/memory/PatternReplicationService.d.ts +187 -0
- package/dist/memory/PatternReplicationService.d.ts.map +1 -0
- package/dist/memory/PatternReplicationService.js +392 -0
- package/dist/memory/PatternReplicationService.js.map +1 -0
- package/dist/providers/ClaudeProvider.d.ts +98 -0
- package/dist/providers/ClaudeProvider.d.ts.map +1 -0
- package/dist/providers/ClaudeProvider.js +418 -0
- package/dist/providers/ClaudeProvider.js.map +1 -0
- package/dist/providers/HybridRouter.d.ts +217 -0
- package/dist/providers/HybridRouter.d.ts.map +1 -0
- package/dist/providers/HybridRouter.js +679 -0
- package/dist/providers/HybridRouter.js.map +1 -0
- package/dist/providers/ILLMProvider.d.ts +287 -0
- package/dist/providers/ILLMProvider.d.ts.map +1 -0
- package/dist/providers/ILLMProvider.js +33 -0
- package/dist/providers/ILLMProvider.js.map +1 -0
- package/dist/providers/LLMProviderFactory.d.ts +154 -0
- package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
- package/dist/providers/LLMProviderFactory.js +426 -0
- package/dist/providers/LLMProviderFactory.js.map +1 -0
- package/dist/providers/RuvllmProvider.d.ts +107 -0
- package/dist/providers/RuvllmProvider.d.ts.map +1 -0
- package/dist/providers/RuvllmProvider.js +417 -0
- package/dist/providers/RuvllmProvider.js.map +1 -0
- package/dist/providers/index.d.ts +32 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +75 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/telemetry/LearningTelemetry.d.ts +190 -0
- package/dist/telemetry/LearningTelemetry.d.ts.map +1 -0
- package/dist/telemetry/LearningTelemetry.js +403 -0
- package/dist/telemetry/LearningTelemetry.js.map +1 -0
- package/dist/telemetry/index.d.ts +1 -0
- package/dist/telemetry/index.d.ts.map +1 -1
- package/dist/telemetry/index.js +20 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/telemetry/instrumentation/agent.d.ts +1 -1
- package/dist/telemetry/instrumentation/agent.js +1 -1
- package/dist/telemetry/instrumentation/index.d.ts +1 -1
- package/dist/telemetry/instrumentation/index.js +1 -1
- package/dist/utils/math.d.ts +11 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +16 -0
- package/dist/utils/math.js.map +1 -0
- package/docs/reference/agents.md +1 -1
- package/docs/reference/skills.md +3 -3
- package/docs/reference/usage.md +4 -4
- package/package.json +1 -1
package/dist/learning/algorithms/AbstractRLLearner.js
@@ -0,0 +1,300 @@
+"use strict";
+/**
+ * AbstractRLLearner - Base class for Reinforcement Learning algorithms
+ *
+ * Provides common functionality for all RL algorithms including:
+ * - Epsilon-greedy exploration policy
+ * - State/action encoding
+ * - Q-table management
+ * - Experience replay integration
+ * - Statistics tracking
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.AbstractRLLearner = void 0;
+const Logger_1 = require("../../utils/Logger");
+const ExperienceReplayBuffer_1 = require("../ExperienceReplayBuffer");
+/**
+ * Abstract base class for RL algorithms
+ */
+class AbstractRLLearner {
+    constructor(config) {
+        this.logger = Logger_1.Logger.getInstance();
+        this.config = config;
+        this.qTable = new Map();
+        this.stepCount = 0;
+        this.episodeCount = 0;
+        // Initialize experience replay buffer if enabled
+        if (this.config.useExperienceReplay) {
+            this.replayBuffer = new ExperienceReplayBuffer_1.ExperienceReplayBuffer({
+                maxSize: this.config.replayBufferSize,
+                minSize: this.config.batchSize,
+                prioritized: false
+            });
+        }
+        this.logger.info(`${this.constructor.name} initialized`, { config });
+    }
+    /**
+     * Select action using epsilon-greedy policy
+     * With probability ε, select random action (exploration)
+     * Otherwise, select action with highest Q-value (exploitation)
+     */
+    selectAction(state, availableActions) {
+        if (availableActions.length === 0) {
+            throw new Error('No available actions to select from');
+        }
+        // Exploration: random action
+        if (Math.random() < this.config.explorationRate) {
+            const randomIndex = Math.floor(Math.random() * availableActions.length);
+            return availableActions[randomIndex];
+        }
+        // Exploitation: best action based on Q-values
+        return this.getBestAction(state, availableActions);
+    }
+    /**
+     * Get best action based on current Q-values (greedy policy)
+     */
+    getBestAction(state, availableActions) {
+        const stateKey = this.encodeState(state);
+        const stateActions = this.qTable.get(stateKey);
+        if (!stateActions || stateActions.size === 0) {
+            // No Q-values yet, return random action
+            const randomIndex = Math.floor(Math.random() * availableActions.length);
+            return availableActions[randomIndex];
+        }
+        // Find action with highest Q-value
+        let bestAction = availableActions[0];
+        let bestValue = -Infinity;
+        for (const action of availableActions) {
+            const actionKey = this.encodeAction(action);
+            const qValue = stateActions.get(actionKey);
+            if (qValue && qValue.value > bestValue) {
+                bestValue = qValue.value;
+                bestAction = action;
+            }
+        }
+        return bestAction;
+    }
+    /**
+     * Get Q-value for a state-action pair
+     */
+    getQValue(state, action) {
+        const stateKey = this.encodeState(state);
+        const actionKey = this.encodeAction(action);
+        const stateActions = this.qTable.get(stateKey);
+        if (!stateActions) {
+            return 0;
+        }
+        const qValue = stateActions.get(actionKey);
+        return qValue?.value ?? 0;
+    }
+    /**
+     * Set Q-value for a state-action pair (protected for subclass use)
+     */
+    setQValue(stateKey, actionKey, value) {
+        if (!this.qTable.has(stateKey)) {
+            this.qTable.set(stateKey, new Map());
+        }
+        const stateActions = this.qTable.get(stateKey);
+        const currentQValue = stateActions.get(actionKey);
+        stateActions.set(actionKey, {
+            state: stateKey,
+            action: actionKey,
+            value,
+            updateCount: (currentQValue?.updateCount ?? 0) + 1,
+            lastUpdated: Date.now()
+        });
+    }
+    /**
+     * Get all Q-values for a state
+     */
+    getStateValues(state) {
+        const stateKey = this.encodeState(state);
+        const stateActions = this.qTable.get(stateKey);
+        if (!stateActions) {
+            return new Map();
+        }
+        const values = new Map();
+        for (const [actionKey, qValue] of stateActions.entries()) {
+            values.set(actionKey, qValue.value);
+        }
+        return values;
+    }
+    /**
+     * Get value of a state (max Q-value over all actions)
+     * V(s) = max_a Q(s,a)
+     */
+    getStateValue(state) {
+        const stateKey = this.encodeState(state);
+        const stateActions = this.qTable.get(stateKey);
+        if (!stateActions || stateActions.size === 0) {
+            return 0;
+        }
+        return Math.max(...Array.from(stateActions.values()).map(qv => qv.value));
+    }
+    /**
+     * Perform batch update using experience replay
+     * Samples random batch from replay buffer and updates Q-values
+     */
+    batchUpdate() {
+        if (!this.replayBuffer || !this.replayBuffer.canSample(this.config.batchSize)) {
+            return;
+        }
+        const batch = this.replayBuffer.sample(this.config.batchSize);
+        for (const experience of batch) {
+            this.update(experience);
+        }
+        this.logger.debug(`Performed batch update with ${batch.length} experiences`);
+    }
+    /**
+     * Decay exploration rate (epsilon)
+     * Called after each episode to gradually reduce exploration
+     */
+    decayExploration() {
+        this.config.explorationRate = Math.max(this.config.minExplorationRate, this.config.explorationRate * this.config.explorationDecay);
+    }
+    /**
+     * Mark end of episode
+     */
+    endEpisode() {
+        this.episodeCount++;
+        this.decayExploration();
+        // Perform batch update if using experience replay
+        if (this.config.useExperienceReplay) {
+            this.batchUpdate();
+        }
+    }
+    /**
+     * Encode state to string key for Q-table
+     * Creates normalized feature vector and discretizes for generalization
+     */
+    encodeState(state) {
+        // Create normalized feature vector
+        const features = [
+            state.taskComplexity,
+            state.requiredCapabilities.length / 10, // normalize
+            state.previousAttempts / 5, // normalize
+            state.availableResources,
+            state.timeConstraint ? Math.min(state.timeConstraint / 300000, 1) : 1 // normalize to 5 min
+        ];
+        // Round to reduce state space (discretization)
+        return features.map(f => Math.round(f * 10) / 10).join(',');
+    }
+    /**
+     * Encode action to string key for Q-table
+     */
+    encodeAction(action) {
+        return `${action.strategy}:${action.parallelization.toFixed(1)}:${action.retryPolicy}`;
+    }
+    /**
+     * Get current exploration rate (epsilon)
+     */
+    getExplorationRate() {
+        return this.config.explorationRate;
+    }
+    /**
+     * Get total number of learning steps
+     */
+    getStepCount() {
+        return this.stepCount;
+    }
+    /**
+     * Get total number of episodes
+     */
+    getEpisodeCount() {
+        return this.episodeCount;
+    }
+    /**
+     * Get Q-table size (number of state-action pairs)
+     */
+    getTableSize() {
+        let size = 0;
+        for (const stateActions of this.qTable.values()) {
+            size += stateActions.size;
+        }
+        return size;
+    }
+    /**
+     * Get statistics about learning progress
+     */
+    getStatistics() {
+        let totalQValue = 0;
+        let count = 0;
+        let maxQ = -Infinity;
+        let minQ = Infinity;
+        for (const stateActions of this.qTable.values()) {
+            for (const qValue of stateActions.values()) {
+                totalQValue += qValue.value;
+                maxQ = Math.max(maxQ, qValue.value);
+                minQ = Math.min(minQ, qValue.value);
+                count++;
+            }
+        }
+        return {
+            steps: this.stepCount,
+            episodes: this.episodeCount,
+            tableSize: count,
+            explorationRate: this.config.explorationRate,
+            avgQValue: count > 0 ? totalQValue / count : 0,
+            maxQValue: count > 0 ? maxQ : 0,
+            minQValue: count > 0 ? minQ : 0
+        };
+    }
+    /**
+     * Reset Q-table and learning state
+     */
+    reset() {
+        this.qTable.clear();
+        this.stepCount = 0;
+        this.episodeCount = 0;
+        this.config.explorationRate = this.getDefaultExplorationRate();
+        if (this.replayBuffer) {
+            this.replayBuffer.clear();
+        }
+        this.logger.info(`${this.constructor.name} reset to initial state`);
+    }
+    /**
+     * Export Q-table and state for persistence
+     */
+    export() {
+        const serializedQTable = {};
+        for (const [state, actions] of this.qTable.entries()) {
+            serializedQTable[state] = {};
+            for (const [action, qValue] of actions.entries()) {
+                serializedQTable[state][action] = qValue;
+            }
+        }
+        return {
+            qTable: serializedQTable,
+            config: { ...this.config },
+            stepCount: this.stepCount,
+            episodeCount: this.episodeCount
+        };
+    }
+    /**
+     * Import Q-table and state from persistence
+     */
+    import(state) {
+        this.qTable.clear();
+        for (const [stateKey, actions] of Object.entries(state.qTable)) {
+            const actionMap = new Map();
+            for (const [actionKey, qValue] of Object.entries(actions)) {
+                actionMap.set(actionKey, qValue);
+            }
+            this.qTable.set(stateKey, actionMap);
+        }
+        this.config = { ...state.config };
+        this.stepCount = state.stepCount;
+        this.episodeCount = state.episodeCount;
+        this.logger.info(`Imported Q-table with ${this.getTableSize()} state-action pairs`);
+    }
+    /**
+     * Get memory usage estimate in bytes
+     */
+    getMemoryUsage() {
+        const qTableSize = JSON.stringify(this.export().qTable).length;
+        const bufferSize = this.replayBuffer?.getMemoryUsage() ?? 0;
+        return qTableSize + bufferSize;
+    }
+}
+exports.AbstractRLLearner = AbstractRLLearner;
+//# sourceMappingURL=AbstractRLLearner.js.map
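For orientation, a minimal standalone sketch of the epsilon-greedy selection and per-episode decay implemented by AbstractRLLearner above; the state/action key formats and the numbers are illustrative assumptions, not the package's exported API.

```typescript
// Standalone sketch: epsilon-greedy selection over a Q-table keyed by
// "stateKey -> actionKey -> value", mirroring the base class above.
type QTable = Map<string, Map<string, number>>;

function selectAction(qTable: QTable, stateKey: string, actions: string[], epsilon: number): string {
  if (actions.length === 0) throw new Error('No available actions');
  // Explore with probability epsilon
  if (Math.random() < epsilon) {
    return actions[Math.floor(Math.random() * actions.length)];
  }
  // Otherwise exploit: pick the action with the highest known Q-value (unknown actions count as 0)
  const stateActions = qTable.get(stateKey) ?? new Map<string, number>();
  return actions.reduce((best, a) =>
    (stateActions.get(a) ?? 0) > (stateActions.get(best) ?? 0) ? a : best
  );
}

// Per-episode decay, as in decayExploration(): epsilon never drops below the floor
function decay(epsilon: number, rate = 0.995, floor = 0.01): number {
  return Math.max(floor, epsilon * rate);
}

// Toy usage: epsilon shrinks from 0.3 toward the 0.01 floor across episodes
let epsilon = 0.3;
const qTable: QTable = new Map([
  ['0.5,0.2,0,1,1', new Map([['parallel:0.5:retry', 0.8]])]
]);
console.log(selectAction(qTable, '0.5,0.2,0,1,1', ['parallel:0.5:retry', 'serial:0.0:fail-fast'], epsilon));
for (let episode = 0; episode < 100; episode++) epsilon = decay(epsilon);
console.log(epsilon.toFixed(3));
```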
package/dist/learning/algorithms/AbstractRLLearner.js.map
@@ -0,0 +1 @@
{"version":3,"file":"AbstractRLLearner.js","sourceRoot":"","sources":["../../../src/learning/algorithms/AbstractRLLearner.ts"],"names":[],"mappings":";AAAA;;;;;;;;;GASG;;;AAEH,+CAA4C;AAE5C,sEAAmE;AA2BnE;;GAEG;AACH,MAAsB,iBAAiB;IAQrC,YAAY,MAAgB;QAC1B,IAAI,CAAC,MAAM,GAAG,eAAM,CAAC,WAAW,EAAE,CAAC;QACnC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,MAAM,GAAG,IAAI,GAAG,EAAE,CAAC;QACxB,IAAI,CAAC,SAAS,GAAG,CAAC,CAAC;QACnB,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QAEtB,iDAAiD;QACjD,IAAI,IAAI,CAAC,MAAM,CAAC,mBAAmB,EAAE,CAAC;YACpC,IAAI,CAAC,YAAY,GAAG,IAAI,+CAAsB,CAAC;gBAC7C,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,gBAAgB;gBACrC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;gBAC9B,WAAW,EAAE,KAAK;aACnB,CAAC,CAAC;QACL,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,cAAc,EAAE,EAAE,MAAM,EAAE,CAAC,CAAC;IACvE,CAAC;IAED;;;;OAIG;IACH,YAAY,CAAC,KAAgB,EAAE,gBAA+B;QAC5D,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACzD,CAAC;QAED,6BAA6B;QAC7B,IAAI,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,eAAe,EAAE,CAAC;YAChD,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC;YACxE,OAAO,gBAAgB,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC;QAED,8CAA8C;QAC9C,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;IACrD,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,KAAgB,EAAE,gBAA+B;QAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAE/C,IAAI,CAAC,YAAY,IAAI,YAAY,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC7C,wCAAwC;YACxC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC;YACxE,OAAO,gBAAgB,CAAC,WAAW,CAAC,CAAC;QACvC,CAAC;QAED,mCAAmC;QACnC,IAAI,UAAU,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;QACrC,IAAI,SAAS,GAAG,CAAC,QAAQ,CAAC;QAE1B,KAAK,MAAM,MAAM,IAAI,gBAAgB,EAAE,CAAC;YACtC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAC5C,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;YAE3C,IAAI,MAAM,IAAI,MAAM,CAAC,KAAK,GAAG,SAAS,EAAE,CAAC;gBACvC,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC;gBACzB,UAAU,GAAG,MAAM,CAAC;YACtB,CAAC;QACH,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAQD;;OAEG;IACH,SAAS,CAAC,KAAgB,EAAE,MAAmB;QAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAE5C,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC/C,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC;QACX,CAAC;QAED,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC3C,OAAO,MAAM,EAAE,KAAK,IAAI,CAAC,CAAC;IAC5B,CAAC;IAED;;OAEG;IACO,SAAS,CAAC,QAAgB,EAAE,SAAiB,EAAE,KAAa;QACpE,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,GAAG,EAAE,CAAC,CAAC;QACvC,CAAC;QACD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAE,CAAC;QAEhD,MAAM,aAAa,GAAG,YAAY,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAClD,YAAY,CAAC,GAAG,CAAC,SAAS,EAAE;YAC1B,KAAK,EAAE,QAAQ;YACf,MAAM,EAAE,SAAS;YACjB,KAAK;YACL,WAAW,EAAE,CAAC,aAAa,EAAE,WAAW,IAAI,CAAC,CAAC,GAAG,CAAC;YAClD,WAAW,EAAE,IAAI,CAAC,GAAG,EAAE;SACxB,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,KAAgB;QAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAE/C,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,OAAO,IAAI,GAAG,EAAE,CAAC;QACnB,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;QACzC,KAAK,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,IAAI,YAAY,CAAC,OAAO,EAAE,EAAE,CAAC;YACzD,MAAM,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QACtC,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;OAGG;IACH,aAAa,CAAC,KAAgB;QAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC
,GAAG,CAAC,QAAQ,CAAC,CAAC;QAE/C,IAAI,CAAC,YAAY,IAAI,YAAY,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC7C,OAAO,CAAC,CAAC;QACX,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC;IAC5E,CAAC;IAED;;;OAGG;IACH,WAAW;QACT,IAAI,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC;YAC9E,OAAO;QACT,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE9D,KAAK,MAAM,UAAU,IAAI,KAAK,EAAE,CAAC;YAC/B,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAC1B,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,+BAA+B,KAAK,CAAC,MAAM,cAAc,CAAC,CAAC;IAC/E,CAAC;IAED;;;OAGG;IACH,gBAAgB;QACd,IAAI,CAAC,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,CACpC,IAAI,CAAC,MAAM,CAAC,kBAAkB,EAC9B,IAAI,CAAC,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC,MAAM,CAAC,gBAAgB,CAC3D,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,UAAU;QACR,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAExB,kDAAkD;QAClD,IAAI,IAAI,CAAC,MAAM,CAAC,mBAAmB,EAAE,CAAC;YACpC,IAAI,CAAC,WAAW,EAAE,CAAC;QACrB,CAAC;IACH,CAAC;IAED;;;OAGG;IACO,WAAW,CAAC,KAAgB;QACpC,mCAAmC;QACnC,MAAM,QAAQ,GAAG;YACf,KAAK,CAAC,cAAc;YACpB,KAAK,CAAC,oBAAoB,CAAC,MAAM,GAAG,EAAE,EAAE,YAAY;YACpD,KAAK,CAAC,gBAAgB,GAAG,CAAC,EAAE,YAAY;YACxC,KAAK,CAAC,kBAAkB;YACxB,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,cAAc,GAAG,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,qBAAqB;SAC5F,CAAC;QAEF,+CAA+C;QAC/C,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9D,CAAC;IAED;;OAEG;IACO,YAAY,CAAC,MAAmB;QACxC,OAAO,GAAG,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC;IACzF,CAAC;IAED;;OAEG;IACH,kBAAkB;QAChB,OAAO,IAAI,CAAC,MAAM,CAAC,eAAe,CAAC;IACrC,CAAC;IAED;;OAEG;IACH,YAAY;QACV,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,eAAe;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,YAAY;QACV,IAAI,IAAI,GAAG,CAAC,CAAC;QACb,KAAK,MAAM,YAAY,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;YAChD,IAAI,IAAI,YAAY,CAAC,IAAI,CAAC;QAC5B,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACH,aAAa;QASX,IAAI,WAAW,GAAG,CAAC,CAAC;QACpB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,IAAI,GAAG,CAAC,QAAQ,CAAC;QACrB,IAAI,IAAI,GAAG,QAAQ,CAAC;QAEpB,KAAK,MAAM,YAAY,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;YAChD,KAAK,MAAM,MAAM,IAAI,YAAY,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3C,WAAW,IAAI,MAAM,CAAC,KAAK,CAAC;gBAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;gBACpC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;gBACpC,KAAK,EAAE,CAAC;YACV,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK,EAAE,IAAI,CAAC,SAAS;YACrB,QAAQ,EAAE,IAAI,CAAC,YAAY;YAC3B,SAAS,EAAE,KAAK;YAChB,eAAe,EAAE,IAAI,CAAC,MAAM,CAAC,eAAe;YAC5C,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9C,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC/B,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SAChC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;QACpB,IAAI,CAAC,SAAS,GAAG,CAAC,CAAC;QACnB,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QACtB,IAAI,CAAC,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC,yBAAyB,EAAE,CAAC;QAE/D,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,CAAC;QAC5B,CAAC;QAED,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,yBAAyB,CAAC,CAAC;IACtE,CAAC;IAOD;;OAEG;IACH,MAAM;QAMJ,MAAM,gBAAgB,GAA2C,EAAE,CAAC;QAEpE,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,EAAE,CAAC;YACrD,gBAAgB,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;YAC7
B,KAAK,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;gBACjD,gBAAgB,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,CAAC;YAC3C,CAAC;QACH,CAAC;QAED,OAAO;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,EAAE,GAAG,IAAI,CAAC,MAAM,EAAE;YAC1B,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,YAAY,EAAE,IAAI,CAAC,YAAY;SAChC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,KAKN;QACC,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;QAEpB,KAAK,MAAM,CAAC,QAAQ,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,CAAC;YAC/D,MAAM,SAAS,GAAG,IAAI,GAAG,EAAkB,CAAC;YAC5C,KAAK,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1D,SAAS,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;YACnC,CAAC;YACD,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;QACvC,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QAClC,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC,SAAS,CAAC;QACjC,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC,YAAY,CAAC;QAEvC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,yBAAyB,IAAI,CAAC,YAAY,EAAE,qBAAqB,CAAC,CAAC;IACtF,CAAC;IAED;;OAEG;IACH,cAAc;QACZ,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,EAAE,cAAc,EAAE,IAAI,CAAC,CAAC;QAC5D,OAAO,UAAU,GAAG,UAAU,CAAC;IACjC,CAAC;CACF;AAjXD,8CAiXC"}
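The encodeState method shown above buckets normalized task features at 0.1 resolution so that similar states share a Q-table entry; a small illustration with made-up feature values:

```typescript
// Two similar task states collapse to the same Q-table key after
// normalization and rounding to one decimal (0.1-resolution buckets).
function encodeState(features: number[]): string {
  return features.map(f => Math.round(f * 10) / 10).join(',');
}

// complexity, capabilities/10, attempts/5, resources, time/300000 (capped at 1)
const a = encodeState([0.62, 3 / 10, 1 / 5, 0.75, 120000 / 300000]);
const b = encodeState([0.58, 3 / 10, 1 / 5, 0.78, 130000 / 300000]);
console.log(a); // "0.6,0.3,0.2,0.8,0.4"
console.log(b); // "0.6,0.3,0.2,0.8,0.4" (same bucket, so the two states share Q-values)
```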
package/dist/learning/algorithms/ActorCriticLearner.d.ts
@@ -0,0 +1,201 @@
+/**
+ * ActorCriticLearner - Actor-Critic Reinforcement Learning Algorithm
+ *
+ * Implements Advantage Actor-Critic (A2C) combining:
+ * - Actor: Policy network that selects actions using softmax policy
+ * - Critic: Value network that estimates state values for advantage calculation
+ *
+ * Key features:
+ * - Continuous action probabilities via softmax
+ * - Advantage-based updates to reduce variance
+ * - Entropy bonus for exploration
+ * - Policy gradient with baseline
+ *
+ * Update rules:
+ * - Critic (Value): V(s) += α_c * δ where δ = r + γV(s') - V(s)
+ * - Actor (Policy): π(a|s) += α_a * δ * ∇log(π(a|s)) + β * H(π)
+ *
+ * @module learning/algorithms/ActorCriticLearner
+ * @version 1.0.0
+ */
+import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+import { TaskState, AgentAction, TaskExperience } from '../types';
+/**
+ * Configuration specific to Actor-Critic algorithm
+ */
+export interface ActorCriticConfig extends RLConfig {
+    /** Actor learning rate (α_a) - typically smaller than critic */
+    actorLearningRate: number;
+    /** Critic learning rate (α_c) */
+    criticLearningRate: number;
+    /** Entropy coefficient (β) for exploration bonus */
+    entropyCoefficient: number;
+    /** Temperature for softmax action selection */
+    temperature: number;
+    /** Whether to use advantage normalization */
+    normalizeAdvantage: boolean;
+    /** Target network update frequency (for stability) */
+    targetUpdateFrequency: number;
+}
+/**
+ * Policy entry storing action probabilities
+ */
+interface PolicyEntry {
+    action: string;
+    probability: number;
+    logProbability: number;
+    updateCount: number;
+    lastUpdated: number;
+}
+/**
+ * State value entry for critic
+ */
+interface StateValueEntry {
+    state: string;
+    value: number;
+    updateCount: number;
+    lastUpdated: number;
+}
+/**
+ * ActorCriticLearner - Advantage Actor-Critic implementation
+ *
+ * Combines policy gradient (actor) with value function approximation (critic)
+ * for more stable and efficient learning than pure Q-learning.
+ *
+ * Usage:
+ * ```typescript
+ * const ac = new ActorCriticLearner({
+ *   learningRate: 0.1,
+ *   actorLearningRate: 0.01,
+ *   criticLearningRate: 0.1,
+ *   discountFactor: 0.95,
+ *   explorationRate: 0.3,
+ *   explorationDecay: 0.995,
+ *   minExplorationRate: 0.01,
+ *   entropyCoefficient: 0.01,
+ *   temperature: 1.0,
+ *   normalizeAdvantage: true,
+ *   targetUpdateFrequency: 100,
+ *   useExperienceReplay: true,
+ *   replayBufferSize: 10000,
+ *   batchSize: 32
+ * });
+ *
+ * const action = ac.selectAction(state, availableActions);
+ * ac.update(experience);
+ * ```
+ */
+export declare class ActorCriticLearner extends AbstractRLLearner {
+    private actorConfig;
+    private policyTable;
+    private valueTable;
+    private targetValueTable;
+    private updatesSinceTargetSync;
+    private advantageHistory;
+    private readonly defaultExploration;
+    constructor(config: ActorCriticConfig);
+    /**
+     * Select action using softmax policy with exploration
+     * π(a|s) = exp(Q(s,a)/τ) / Σ_a' exp(Q(s,a')/τ)
+     */
+    selectAction(state: TaskState, availableActions: AgentAction[]): AgentAction;
+    /**
+     * Sample action from softmax policy distribution
+     */
+    private sampleFromPolicy;
+    /**
+     * Get softmax action probabilities
+     * π(a|s) = exp(preference(s,a)/τ) / Σ_a' exp(preference(s,a')/τ)
+     */
+    private getActionProbabilities;
+    /**
+     * Get preference for state-action pair from policy table
+     */
+    private getPreference;
+    /**
+     * Update actor and critic using temporal difference
+     *
+     * TD Error (advantage): δ = r + γV(s') - V(s)
+     * Critic update: V(s) += α_c * δ
+     * Actor update: preference(s,a) += α_a * δ * (1 - π(a|s))
+     */
+    update(experience: TaskExperience, nextAction?: AgentAction): void;
+    /**
+     * Update critic (value function)
+     * V(s) += α_c * δ
+     */
+    private updateCritic;
+    /**
+     * Update actor (policy)
+     * For softmax policy: preference(s,a) += α_a * δ * (1 - π(a|s))
+     * This increases preference for actions with positive advantage
+     */
+    private updateActor;
+    /**
+     * Calculate entropy bonus for a state
+     * H(π(·|s)) = -Σ_a π(a|s) log(π(a|s))
+     */
+    private calculateEntropyBonus;
+    /**
+     * Get softmax probability for a specific action
+     */
+    private softmaxProb;
+    /**
+     * Normalize advantage using running statistics
+     */
+    private normalizeAdvantage;
+    /**
+     * Get state value from value table
+     */
+    getStateValue(state: TaskState): number;
+    /**
+     * Get state value from target network (for stability)
+     */
+    private getTargetStateValue;
+    /**
+     * Sync target network with main network
+     */
+    private syncTargetNetwork;
+    /**
+     * Extract experience components
+     */
+    private extractExperience;
+    /**
+     * Get default exploration rate for reset
+     */
+    protected getDefaultExplorationRate(): number;
+    /**
+     * Get actor-critic specific statistics
+     */
+    getActorCriticStatistics(): {
+        valueTableSize: number;
+        policyTableSize: number;
+        avgStateValue: number;
+        avgEntropy: number;
+        advantageMean: number;
+        advantageStd: number;
+    };
+    /**
+     * Reset actor-critic specific state
+     */
+    reset(): void;
+    /**
+     * Export complete actor-critic state
+     */
+    exportActorCritic(): {
+        base: ReturnType<AbstractRLLearner['export']>;
+        valueTable: Record<string, StateValueEntry>;
+        policyTable: Record<string, Record<string, PolicyEntry>>;
+        actorConfig: ActorCriticConfig;
+    };
+    /**
+     * Import complete actor-critic state
+     */
+    importActorCritic(state: ReturnType<typeof this.exportActorCritic>): void;
+}
+/**
+ * Create default Actor-Critic configuration
+ */
+export declare function createDefaultActorCriticConfig(): ActorCriticConfig;
+export {};
+//# sourceMappingURL=ActorCriticLearner.d.ts.map
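The update rules quoted in the declaration comments above can be traced with concrete numbers; this is a rough sketch of one advantage actor-critic step under those formulas, with illustrative values and helper names rather than the package's internal implementation:

```typescript
// One advantage actor-critic step with toy numbers, following the rules in the
// declaration comments: delta = r + gamma * V(s') - V(s), then critic and actor updates.
const gamma = 0.95;          // discountFactor
const alphaCritic = 0.1;     // criticLearningRate
const alphaActor = 0.01;     // actorLearningRate
const tau = 1.0;             // softmax temperature

// Softmax over action preferences: pi(a|s) = exp(pref/tau) / sum over a' of exp(pref'/tau)
function softmax(prefs: number[], temperature: number): number[] {
  const exps = prefs.map(p => Math.exp(p / temperature));
  const sum = exps.reduce((s, e) => s + e, 0);
  return exps.map(e => e / sum);
}

let V_s = 0.2;        // critic's current estimate for state s
const V_next = 0.5;   // critic's estimate for s'
const reward = 1.0;

// TD error doubles as the advantage estimate
const delta = reward + gamma * V_next - V_s;   // 1 + 0.475 - 0.2 = 1.275

// Critic update: V(s) += alpha_c * delta
V_s += alphaCritic * delta;                    // 0.2 + 0.1275 = 0.3275

// Actor update for the taken action a: pref(s,a) += alpha_a * delta * (1 - pi(a|s))
const prefs = [0.0, 0.0];                      // two available actions, action 0 was taken
const pi = softmax(prefs, tau);                // [0.5, 0.5]
prefs[0] += alphaActor * delta * (1 - pi[0]);  // 0 + 0.01 * 1.275 * 0.5 = 0.006375

console.log({ delta, V_s, prefs, piAfter: softmax(prefs, tau) });
```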
package/dist/learning/algorithms/ActorCriticLearner.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"ActorCriticLearner.d.ts","sourceRoot":"","sources":["../../../src/learning/algorithms/ActorCriticLearner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,OAAO,EAAE,iBAAiB,EAAE,QAAQ,EAAU,MAAM,qBAAqB,CAAC;AAC1E,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,iBAAkB,SAAQ,QAAQ;IACjD,gEAAgE;IAChE,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,kBAAkB,EAAE,MAAM,CAAC;IAC3B,oDAAoD;IACpD,kBAAkB,EAAE,MAAM,CAAC;IAC3B,+CAA+C;IAC/C,WAAW,EAAE,MAAM,CAAC;IACpB,6CAA6C;IAC7C,kBAAkB,EAAE,OAAO,CAAC;IAC5B,sDAAsD;IACtD,qBAAqB,EAAE,MAAM,CAAC;CAC/B;AAED;;GAEG;AACH,UAAU,WAAW;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,UAAU,eAAe;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,qBAAa,kBAAmB,SAAQ,iBAAiB;IACvD,OAAO,CAAC,WAAW,CAAoB;IACvC,OAAO,CAAC,WAAW,CAAwC;IAC3D,OAAO,CAAC,UAAU,CAA+B;IACjD,OAAO,CAAC,gBAAgB,CAA+B;IACvD,OAAO,CAAC,sBAAsB,CAAS;IACvC,OAAO,CAAC,gBAAgB,CAAW;IACnC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAS;gBAEhC,MAAM,EAAE,iBAAiB;IAkBrC;;;OAGG;IACM,YAAY,CAAC,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,WAAW,EAAE,GAAG,WAAW;IAerF;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;;OAGG;IACH,OAAO,CAAC,sBAAsB;IAmB9B;;OAEG;IACH,OAAO,CAAC,aAAa;IAUrB;;;;;;OAMG;IACM,MAAM,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,EAAE,WAAW,GAAG,IAAI;IA8C3E;;;OAGG;IACH,OAAO,CAAC,YAAY;IAYpB;;;;OAIG;IACH,OAAO,CAAC,WAAW;IA2BnB;;;OAGG;IACH,OAAO,CAAC,qBAAqB;IAuB7B;;OAEG;IACH,OAAO,CAAC,WAAW;IAwBnB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAmB1B;;OAEG;IACM,aAAa,CAAC,KAAK,EAAE,SAAS,GAAG,MAAM;IAMhD;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAM3B;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAQzB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAgBzB;;OAEG;IACH,SAAS,CAAC,yBAAyB,IAAI,MAAM;IAI7C;;OAEG;IACH,wBAAwB,IAAI;QAC1B,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;QACxB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;QACnB,aAAa,EAAE,MAAM,CAAC;QACtB,YAAY,EAAE,MAAM,CAAC;KACtB;IA0CD;;OAEG;IACM,KAAK,IAAI,IAAI;IAUtB;;OAEG;IACH,iBAAiB,IAAI;QACnB,IAAI,EAAE,UAAU,CAAC,iBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;QAC9C,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC;QAC5C,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC;QACzD,WAAW,EAAE,iBAAiB,CAAC;KAChC;IAsBD;;OAEG;IACH,iBAAiB,CAAC,KAAK,EAAE,UAAU,CAAC,OAAO,IAAI,CAAC,iBAAiB,CAAC,GAAG,IAAI;CAyB1E;AAED;;GAEG;AACH,wBAAgB,8BAA8B,IAAI,iBAAiB,CAiBlE"}