ruvector 0.1.65 → 0.1.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.ruvector/intelligence.json +125 -0
- package/README.md +169 -20
- package/bin/cli.js +443 -0
- package/bin/mcp-server.js +337 -0
- package/dist/core/index.d.ts +4 -0
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +8 -1
- package/dist/core/learning-engine.d.ts +160 -0
- package/dist/core/learning-engine.d.ts.map +1 -0
- package/dist/core/learning-engine.js +589 -0
- package/dist/core/tensor-compress.d.ts +134 -0
- package/dist/core/tensor-compress.d.ts.map +1 -0
- package/dist/core/tensor-compress.js +432 -0
- package/package.json +1 -1
- package/ruvector.db +0 -0
package/dist/core/learning-engine.js, @@ -0,0 +1,589 @@ (new file; the added lines are shown below without the leading + markers):

```js
"use strict";
/**
 * Multi-Algorithm Learning Engine
 * Supports 9 RL algorithms for intelligent hooks optimization
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.LearningEngine = void 0;
// Default configs for each task type
const TASK_ALGORITHM_MAP = {
    'agent-routing': {
        algorithm: 'double-q',
        learningRate: 0.1,
        discountFactor: 0.95,
        epsilon: 0.1,
    },
    'error-avoidance': {
        algorithm: 'sarsa',
        learningRate: 0.05,
        discountFactor: 0.99,
        epsilon: 0.05,
    },
    'confidence-scoring': {
        algorithm: 'actor-critic',
        learningRate: 0.01,
        discountFactor: 0.95,
        epsilon: 0.1,
        entropyCoef: 0.01,
    },
    'trajectory-learning': {
        algorithm: 'decision-transformer',
        learningRate: 0.001,
        discountFactor: 0.99,
        epsilon: 0,
        sequenceLength: 20,
    },
    'context-ranking': {
        algorithm: 'ppo',
        learningRate: 0.0003,
        discountFactor: 0.99,
        epsilon: 0.2,
        clipRange: 0.2,
        entropyCoef: 0.01,
    },
    'memory-recall': {
        algorithm: 'td-lambda',
        learningRate: 0.1,
        discountFactor: 0.9,
        epsilon: 0.1,
        lambda: 0.8,
    },
};
class LearningEngine {
    constructor() {
        this.configs = new Map();
        this.qTables = new Map();
        this.qTables2 = new Map(); // For Double-Q
        this.eligibilityTraces = new Map();
        this.actorWeights = new Map();
        this.criticValues = new Map();
        this.trajectories = [];
        this.stats = new Map();
        this.rewardHistory = [];
        // Initialize with default configs
        for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) {
            this.configs.set(task, { ...config });
        }
        // Initialize stats for all algorithms
        const algorithms = [
            'q-learning', 'sarsa', 'double-q', 'actor-critic',
            'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn'
        ];
        for (const alg of algorithms) {
            this.stats.set(alg, {
                algorithm: alg,
                updates: 0,
                avgReward: 0,
                convergenceScore: 0,
                lastUpdate: Date.now(),
            });
        }
    }
    /**
     * Configure algorithm for a specific task type
     */
    configure(task, config) {
        const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task];
        this.configs.set(task, { ...existing, ...config });
    }
    /**
     * Get current configuration for a task
     */
    getConfig(task) {
        return this.configs.get(task) || TASK_ALGORITHM_MAP[task];
    }
    /**
     * Update Q-value using the appropriate algorithm
     */
    update(task, experience) {
        const config = this.getConfig(task);
        let delta = 0;
        switch (config.algorithm) {
            case 'q-learning':
                delta = this.qLearningUpdate(experience, config);
                break;
            case 'sarsa':
                delta = this.sarsaUpdate(experience, config);
                break;
            case 'double-q':
                delta = this.doubleQUpdate(experience, config);
                break;
            case 'actor-critic':
                delta = this.actorCriticUpdate(experience, config);
                break;
            case 'ppo':
                delta = this.ppoUpdate(experience, config);
                break;
            case 'td-lambda':
                delta = this.tdLambdaUpdate(experience, config);
                break;
            case 'monte-carlo':
                // Monte Carlo needs full episodes
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.monteCarloUpdate(config);
                }
                break;
            case 'decision-transformer':
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.decisionTransformerUpdate(config);
                }
                break;
            case 'dqn':
                delta = this.dqnUpdate(experience, config);
                break;
        }
        // Update stats
        this.updateStats(config.algorithm, experience.reward, Math.abs(delta));
        return delta;
    }
    /**
     * Get best action for a state
     */
    getBestAction(task, state, actions) {
        const config = this.getConfig(task);
        // Epsilon-greedy exploration
        if (Math.random() < config.epsilon) {
            const randomAction = actions[Math.floor(Math.random() * actions.length)];
            return { action: randomAction, confidence: 0.5 };
        }
        let bestAction = actions[0];
        let bestValue = -Infinity;
        let values = [];
        const qTable = this.getQTable(state);
        for (const action of actions) {
            const value = qTable.get(action) || 0;
            values.push(value);
            if (value > bestValue) {
                bestValue = value;
                bestAction = action;
            }
        }
        // Calculate confidence using softmax
        const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction));
        return { action: bestAction, confidence };
    }
    /**
     * Get action probabilities (for Actor-Critic and PPO)
     */
    getActionProbabilities(state, actions) {
        const probs = new Map();
        const qTable = this.getQTable(state);
        const values = actions.map(a => qTable.get(a) || 0);
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        for (let i = 0; i < actions.length; i++) {
            probs.set(actions[i], expValues[i] / sumExp);
        }
        return probs;
    }
    // ============ Algorithm Implementations ============
    /**
     * Standard Q-Learning: Q(s,a) += α * (r + γ * max_a' Q(s',a') - Q(s,a))
     */
    qLearningUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
        const target = reward + γ * maxNextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }
    /**
     * SARSA: On-policy, more conservative
     * Q(s,a) += α * (r + γ * Q(s',a') - Q(s,a))
     */
    sarsaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, epsilon } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        // On-policy: use expected value under current policy (ε-greedy)
        let nextQ = 0;
        if (!done) {
            const nextActions = Array.from(nextQTable.keys());
            if (nextActions.length > 0) {
                const maxQ = Math.max(...Array.from(nextQTable.values()));
                const avgQ = Array.from(nextQTable.values()).reduce((a, b) => a + b, 0) / nextActions.length;
                // Expected value under ε-greedy
                nextQ = (1 - epsilon) * maxQ + epsilon * avgQ;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }
    /**
     * Double Q-Learning: Reduces overestimation bias
     * Uses two Q-tables, randomly updates one using the other for target
     */
    doubleQUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const useFirst = Math.random() < 0.5;
        const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state);
        const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState);
        const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            // Find best action in next state using one table
            let bestAction = '';
            let bestValue = -Infinity;
            for (const [a, v] of nextQTable) {
                if (v > bestValue) {
                    bestValue = v;
                    bestAction = a;
                }
            }
            // Evaluate using other table
            if (bestAction) {
                nextQ = otherQTable.get(bestAction) || 0;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }
    /**
     * Actor-Critic: Policy gradient with value baseline
     */
    actorCriticUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        // Critic update (TD error)
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const tdError = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * tdError);
        // Actor update (policy gradient)
        const qTable = this.getQTable(state);
        const currentQ = qTable.get(action) || 0;
        // Use TD error as advantage estimate
        const newQ = currentQ + α * tdError;
        qTable.set(action, newQ);
        return tdError;
    }
    /**
     * PPO: Clipped policy gradient for stable training
     */
    ppoUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, clipRange = 0.2 } = config;
        // Critic update
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const advantage = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * advantage);
        // Actor update with clipping
        const qTable = this.getQTable(state);
        const oldQ = qTable.get(action) || 0;
        // Compute probability ratio (simplified)
        const ratio = Math.exp(α * advantage);
        const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio));
        // PPO objective: min(ratio * A, clip(ratio) * A)
        const update = Math.min(ratio * advantage, clippedRatio * advantage);
        const newQ = oldQ + α * update;
        qTable.set(action, newQ);
        return advantage;
    }
    /**
     * TD(λ): Temporal difference with eligibility traces
     */
    tdLambdaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, lambda = 0.8 } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
        const tdError = reward + γ * maxNextQ - currentQ;
        // Update eligibility trace for current state-action
        const traces = this.getEligibilityTraces(state);
        traces.set(action, (traces.get(action) || 0) + 1);
        // Update all state-actions with eligibility traces
        for (const [s, sTraces] of this.eligibilityTraces) {
            const sQTable = this.getQTable(s);
            for (const [a, trace] of sTraces) {
                const q = sQTable.get(a) || 0;
                sQTable.set(a, q + α * tdError * trace);
                // Decay trace
                sTraces.set(a, γ * lambda * trace);
            }
        }
        return tdError;
    }
    /**
     * Monte Carlo: Full episode learning
     */
    monteCarloUpdate(config) {
        const { learningRate: α, discountFactor: γ } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let G = 0; // Return
        let totalDelta = 0;
        // Work backwards through episode
        for (let t = trajectory.experiences.length - 1; t >= 0; t--) {
            const exp = trajectory.experiences[t];
            G = exp.reward + γ * G;
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            const delta = G - currentQ;
            qTable.set(exp.action, currentQ + α * delta);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = G;
        return totalDelta / trajectory.experiences.length;
    }
    /**
     * Decision Transformer: Sequence modeling for trajectories
     */
    decisionTransformerUpdate(config) {
        const { learningRate: α, sequenceLength = 20 } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        // Decision Transformer learns to predict actions given (return, state, action) sequences
        // Here we use a simplified version that learns state-action patterns
        let totalDelta = 0;
        const experiences = trajectory.experiences.slice(-sequenceLength);
        // Calculate returns-to-go
        const returns = [];
        let R = 0;
        for (let i = experiences.length - 1; i >= 0; i--) {
            R += experiences[i].reward;
            returns.unshift(R);
        }
        // Update Q-values weighted by return-to-go
        for (let i = 0; i < experiences.length; i++) {
            const exp = experiences[i];
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            // Weight by normalized return
            const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1);
            const target = currentQ + α * normalizedReturn * exp.reward;
            const delta = target - currentQ;
            qTable.set(exp.action, target);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = returns[0];
        return totalDelta / experiences.length;
    }
    /**
     * DQN: Deep Q-Network (simplified without actual neural network)
     * Uses experience replay and target network concepts
     */
    dqnUpdate(exp, config) {
        // Add to replay buffer (trajectory)
        this.addToCurrentTrajectory(exp);
        // Sample from replay buffer
        const replayExp = this.sampleFromReplay();
        if (!replayExp)
            return this.qLearningUpdate(exp, config);
        // Use sampled experience for update (breaks correlation)
        return this.qLearningUpdate(replayExp, config);
    }
    // ============ Helper Methods ============
    getQTable(state) {
        if (!this.qTables.has(state)) {
            this.qTables.set(state, new Map());
        }
        return this.qTables.get(state);
    }
    getQTable2(state) {
        if (!this.qTables2.has(state)) {
            this.qTables2.set(state, new Map());
        }
        return this.qTables2.get(state);
    }
    getEligibilityTraces(state) {
        if (!this.eligibilityTraces.has(state)) {
            this.eligibilityTraces.set(state, new Map());
        }
        return this.eligibilityTraces.get(state);
    }
    softmaxConfidence(values, selectedIdx) {
        if (values.length === 0)
            return 0.5;
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        return expValues[selectedIdx] / sumExp;
    }
    addToCurrentTrajectory(exp) {
        if (this.trajectories.length === 0 || this.trajectories[this.trajectories.length - 1].completed) {
            this.trajectories.push({
                experiences: [],
                totalReward: 0,
                completed: false,
            });
        }
        this.trajectories[this.trajectories.length - 1].experiences.push(exp);
    }
    sampleFromReplay() {
        const allExperiences = [];
        for (const traj of this.trajectories) {
            allExperiences.push(...traj.experiences);
        }
        if (allExperiences.length === 0)
            return null;
        return allExperiences[Math.floor(Math.random() * allExperiences.length)];
    }
    updateStats(algorithm, reward, delta) {
        const stats = this.stats.get(algorithm);
        if (!stats)
            return;
        stats.updates++;
        stats.lastUpdate = Date.now();
        // Running average reward
        this.rewardHistory.push(reward);
        if (this.rewardHistory.length > 1000) {
            this.rewardHistory.shift();
        }
        stats.avgReward = this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length;
        // Convergence score (inverse of recent delta magnitude)
        stats.convergenceScore = 1 / (1 + delta);
    }
    /**
     * Get statistics for all algorithms
     */
    getStats() {
        return new Map(this.stats);
    }
    /**
     * Get statistics summary
     */
    getStatsSummary() {
        let bestAlgorithm = 'q-learning';
        let bestScore = -Infinity;
        let totalUpdates = 0;
        const algorithms = [];
        for (const [alg, stats] of this.stats) {
            algorithms.push(stats);
            totalUpdates += stats.updates;
            const score = stats.avgReward * stats.convergenceScore;
            if (score > bestScore && stats.updates > 0) {
                bestScore = score;
                bestAlgorithm = alg;
            }
        }
        return {
            bestAlgorithm,
            totalUpdates,
            avgReward: this.rewardHistory.length > 0
                ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length
                : 0,
            algorithms: algorithms.filter(a => a.updates > 0),
        };
    }
    /**
     * Export state for persistence
     */
    export() {
        const qTables = {};
        for (const [state, actions] of this.qTables) {
            qTables[state] = Object.fromEntries(actions);
        }
        const qTables2 = {};
        for (const [state, actions] of this.qTables2) {
            qTables2[state] = Object.fromEntries(actions);
        }
        const criticValues = Object.fromEntries(this.criticValues);
        const stats = {};
        for (const [alg, s] of this.stats) {
            stats[alg] = s;
        }
        const configs = {};
        for (const [task, config] of this.configs) {
            configs[task] = config;
        }
        return {
            qTables,
            qTables2,
            criticValues,
            trajectories: this.trajectories.slice(-100), // Keep last 100 trajectories
            stats,
            configs,
            rewardHistory: this.rewardHistory.slice(-1000),
        };
    }
    /**
     * Import state from persistence
     */
    import(data) {
        // Q-tables
        this.qTables.clear();
        for (const [state, actions] of Object.entries(data.qTables || {})) {
            this.qTables.set(state, new Map(Object.entries(actions)));
        }
        this.qTables2.clear();
        for (const [state, actions] of Object.entries(data.qTables2 || {})) {
            this.qTables2.set(state, new Map(Object.entries(actions)));
        }
        // Critic values
        this.criticValues = new Map(Object.entries(data.criticValues || {}));
        // Trajectories
        this.trajectories = data.trajectories || [];
        // Stats
        for (const [alg, s] of Object.entries(data.stats || {})) {
            this.stats.set(alg, s);
        }
        // Configs
        for (const [task, config] of Object.entries(data.configs || {})) {
            this.configs.set(task, config);
        }
        // Reward history
        this.rewardHistory = data.rewardHistory || [];
    }
    /**
     * Clear all learning data
     */
    clear() {
        this.qTables.clear();
        this.qTables2.clear();
        this.eligibilityTraces.clear();
        this.actorWeights.clear();
        this.criticValues.clear();
        this.trajectories = [];
        this.rewardHistory = [];
        // Reset stats
        for (const stats of this.stats.values()) {
            stats.updates = 0;
            stats.avgReward = 0;
            stats.convergenceScore = 0;
        }
    }
    /**
     * Get available algorithms
     */
    static getAlgorithms() {
        return [
            { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' },
            { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' },
            { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' },
            { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' },
            { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' },
            { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' },
            { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' },
            { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' },
            { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' },
        ];
    }
}
exports.LearningEngine = LearningEngine;
exports.default = LearningEngine;
```
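For orientation, a minimal sketch of how this engine is driven. The method names and the experience shape (`{ state, action, reward, nextState, done }`) come straight from the listing above; the require path is inferred from the package's file layout, and the state keys, candidate actions, and reward value are hypothetical:

```js
const { LearningEngine } = require('ruvector/dist/core/learning-engine');

const engine = new LearningEngine();

// 'agent-routing' defaults to double-q (see TASK_ALGORITHM_MAP above).
const { action, confidence } = engine.getBestAction(
  'agent-routing',                 // task type from TASK_ALGORITHM_MAP
  'edit:src/index.ts',             // hypothetical state key
  ['coder', 'reviewer', 'tester']  // hypothetical candidate actions
);

// Feed the outcome back as an experience; update() returns the TD delta.
const delta = engine.update('agent-routing', {
  state: 'edit:src/index.ts',
  action,
  reward: 1,                       // hypothetical reward for a good outcome
  nextState: 'review:src/index.ts',
  done: true,
});
```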
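Per the listing, `configure()` shallow-merges overrides into a task's defaults, and the stats accessors summarize learning progress across algorithms. A sketch with hypothetical override values:

```js
const { LearningEngine } = require('ruvector/dist/core/learning-engine');
const engine = new LearningEngine();

// Switch error-avoidance from its sarsa default to td-lambda (hypothetical choice).
engine.configure('error-avoidance', { algorithm: 'td-lambda', lambda: 0.9 });

// Static catalog of the 9 supported algorithms.
console.log(LearningEngine.getAlgorithms().map(a => a.algorithm));

// Aggregate stats: best-scoring algorithm, total updates, running avg reward.
const summary = engine.getStatsSummary();
console.log(summary.bestAlgorithm, summary.totalUpdates, summary.avgReward);
```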
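Finally, `export()` and `import()` round-trip the engine's state through plain objects, so it serializes cleanly as JSON. The package also ships a `ruvector.db`, which may be where this state lives in practice, though the diff alone does not confirm that. A round-trip sketch with a hypothetical file name:

```js
const fs = require('fs');
const { LearningEngine } = require('ruvector/dist/core/learning-engine');
const engine = new LearningEngine();

// Persist Q-tables, critic values, trajectories, stats, and configs.
fs.writeFileSync('learning-state.json', JSON.stringify(engine.export()));

// Restore into a fresh engine later.
const restored = new LearningEngine();
restored.import(JSON.parse(fs.readFileSync('learning-state.json', 'utf8')));
```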