ruvector 0.1.65 → 0.1.66

@@ -0,0 +1,589 @@
+ "use strict";
+ /**
+  * Multi-Algorithm Learning Engine
+  * Supports 9 RL algorithms for intelligent hooks optimization
+  */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.LearningEngine = void 0;
+ // Default configs for each task type
+ const TASK_ALGORITHM_MAP = {
+     'agent-routing': {
+         algorithm: 'double-q',
+         learningRate: 0.1,
+         discountFactor: 0.95,
+         epsilon: 0.1,
+     },
+     'error-avoidance': {
+         algorithm: 'sarsa',
+         learningRate: 0.05,
+         discountFactor: 0.99,
+         epsilon: 0.05,
+     },
+     'confidence-scoring': {
+         algorithm: 'actor-critic',
+         learningRate: 0.01,
+         discountFactor: 0.95,
+         epsilon: 0.1,
+         entropyCoef: 0.01,
+     },
+     'trajectory-learning': {
+         algorithm: 'decision-transformer',
+         learningRate: 0.001,
+         discountFactor: 0.99,
+         epsilon: 0,
+         sequenceLength: 20,
+     },
+     'context-ranking': {
+         algorithm: 'ppo',
+         learningRate: 0.0003,
+         discountFactor: 0.99,
+         epsilon: 0.2,
+         clipRange: 0.2,
+         entropyCoef: 0.01,
+     },
+     'memory-recall': {
+         algorithm: 'td-lambda',
+         learningRate: 0.1,
+         discountFactor: 0.9,
+         epsilon: 0.1,
+         lambda: 0.8,
+     },
+ };
+ class LearningEngine {
+     constructor() {
+         this.configs = new Map();
+         this.qTables = new Map();
+         this.qTables2 = new Map(); // For Double-Q
+         this.eligibilityTraces = new Map();
+         this.actorWeights = new Map();
+         this.criticValues = new Map();
+         this.trajectories = [];
+         this.stats = new Map();
+         this.rewardHistory = [];
+         // Initialize with default configs
+         for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) {
+             this.configs.set(task, { ...config });
+         }
+         // Initialize stats for all algorithms
+         const algorithms = [
+             'q-learning', 'sarsa', 'double-q', 'actor-critic',
+             'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn'
+         ];
+         for (const alg of algorithms) {
+             this.stats.set(alg, {
+                 algorithm: alg,
+                 updates: 0,
+                 avgReward: 0,
+                 convergenceScore: 0,
+                 lastUpdate: Date.now(),
+             });
+         }
+     }
+     /**
+      * Configure algorithm for a specific task type
+      */
+     configure(task, config) {
+         const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task];
+         this.configs.set(task, { ...existing, ...config });
+     }
+     /**
+      * Get current configuration for a task
+      */
+     getConfig(task) {
+         return this.configs.get(task) || TASK_ALGORITHM_MAP[task];
+     }
+     /**
+      * Update Q-value using the appropriate algorithm
+      */
+     update(task, experience) {
+         const config = this.getConfig(task);
+         let delta = 0;
+         switch (config.algorithm) {
+             case 'q-learning':
+                 delta = this.qLearningUpdate(experience, config);
+                 break;
+             case 'sarsa':
+                 delta = this.sarsaUpdate(experience, config);
+                 break;
+             case 'double-q':
+                 delta = this.doubleQUpdate(experience, config);
+                 break;
+             case 'actor-critic':
+                 delta = this.actorCriticUpdate(experience, config);
+                 break;
+             case 'ppo':
+                 delta = this.ppoUpdate(experience, config);
+                 break;
+             case 'td-lambda':
+                 delta = this.tdLambdaUpdate(experience, config);
+                 break;
+             case 'monte-carlo':
+                 // Monte Carlo needs full episodes
+                 this.addToCurrentTrajectory(experience);
+                 if (experience.done) {
+                     delta = this.monteCarloUpdate(config);
+                 }
+                 break;
+             case 'decision-transformer':
+                 this.addToCurrentTrajectory(experience);
+                 if (experience.done) {
+                     delta = this.decisionTransformerUpdate(config);
+                 }
+                 break;
+             case 'dqn':
+                 delta = this.dqnUpdate(experience, config);
+                 break;
+         }
+         // Update stats
+         this.updateStats(config.algorithm, experience.reward, Math.abs(delta));
+         return delta;
+     }
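+     // Illustrative: update('agent-routing', { state, action, reward, nextState, done })
+     // dispatches to doubleQUpdate per TASK_ALGORITHM_MAP.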
+     /**
+      * Get best action for a state
+      */
+     getBestAction(task, state, actions) {
+         const config = this.getConfig(task);
+         // Epsilon-greedy exploration
+         if (Math.random() < config.epsilon) {
+             const randomAction = actions[Math.floor(Math.random() * actions.length)];
+             return { action: randomAction, confidence: 0.5 };
+         }
+         let bestAction = actions[0];
+         let bestValue = -Infinity;
+         const values = [];
+         const qTable = this.getQTable(state);
+         for (const action of actions) {
+             const value = qTable.get(action) || 0;
+             values.push(value);
+             if (value > bestValue) {
+                 bestValue = value;
+                 bestAction = action;
+             }
+         }
+         // Calculate confidence using softmax
+         const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction));
+         return { action: bestAction, confidence };
+     }
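+     // Illustrative: with epsilon = 0.1, roughly 10% of calls return a uniformly
+     // random action with a flat confidence of 0.5; the rest are greedy.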
+     /**
+      * Get action probabilities (for Actor-Critic and PPO)
+      */
+     getActionProbabilities(state, actions) {
+         const probs = new Map();
+         const qTable = this.getQTable(state);
+         const values = actions.map(a => qTable.get(a) || 0);
+         const maxVal = Math.max(...values);
+         const expValues = values.map(v => Math.exp(v - maxVal));
+         const sumExp = expValues.reduce((a, b) => a + b, 0);
+         for (let i = 0; i < actions.length; i++) {
+             probs.set(actions[i], expValues[i] / sumExp);
+         }
+         return probs;
+     }
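+     // Illustrative: Q-values [1, 0] give softmax probabilities
+     // [e^0, e^-1] / (e^0 + e^-1) ≈ [0.73, 0.27] after max-subtraction.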
+     // ============ Algorithm Implementations ============
+     /**
+      * Standard Q-Learning: Q(s,a) += α * (r + γ * max_a' Q(s',a') - Q(s,a))
+      */
+     qLearningUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ } = config;
+         const qTable = this.getQTable(state);
+         const nextQTable = this.getQTable(nextState);
+         const currentQ = qTable.get(action) || 0;
+         const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
+         const target = reward + γ * maxNextQ;
+         const delta = target - currentQ;
+         const newQ = currentQ + α * delta;
+         qTable.set(action, newQ);
+         return delta;
+     }
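+     // Worked example (illustrative): α=0.1, γ=0.95, Q(s,a)=0, r=1 and
+     // max_a' Q(s',a')=0.5 give target = 1 + 0.95*0.5 = 1.475, so
+     // Q(s,a) ← 0 + 0.1 * 1.475 = 0.1475. The Math.max(0, ...) floor
+     // treats an empty next-state table (unseen actions) as value 0.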
+     /**
+      * Expected SARSA: on-policy, more conservative than Q-learning
+      * Q(s,a) += α * (r + γ * E_π[Q(s',a')] - Q(s,a)), where the expectation
+      * is taken under the current ε-greedy policy
+      */
+     sarsaUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ, epsilon } = config;
+         const qTable = this.getQTable(state);
+         const nextQTable = this.getQTable(nextState);
+         const currentQ = qTable.get(action) || 0;
+         // On-policy: use expected value under current policy (ε-greedy)
+         let nextQ = 0;
+         if (!done) {
+             const nextActions = Array.from(nextQTable.keys());
+             if (nextActions.length > 0) {
+                 const maxQ = Math.max(...Array.from(nextQTable.values()));
+                 const avgQ = Array.from(nextQTable.values()).reduce((a, b) => a + b, 0) / nextActions.length;
+                 // Expected value under ε-greedy
+                 nextQ = (1 - epsilon) * maxQ + epsilon * avgQ;
+             }
+         }
+         const target = reward + γ * nextQ;
+         const delta = target - currentQ;
+         const newQ = currentQ + α * delta;
+         qTable.set(action, newQ);
+         return delta;
+     }
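+     // Illustrative: ε=0.05, max Q(s',·)=1.0 and mean Q(s',·)=0.5 give an
+     // expected next value of 0.95*1.0 + 0.05*0.5 = 0.975.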
+     /**
+      * Double Q-Learning: Reduces overestimation bias
+      * Uses two Q-tables, randomly updates one using the other for target
+      */
+     doubleQUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ } = config;
+         const useFirst = Math.random() < 0.5;
+         const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state);
+         const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState);
+         const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState);
+         const currentQ = qTable.get(action) || 0;
+         let nextQ = 0;
+         if (!done) {
+             // Find best next-state action using the table being updated
+             let bestAction = '';
+             let bestValue = -Infinity;
+             for (const [a, v] of nextQTable) {
+                 if (v > bestValue) {
+                     bestValue = v;
+                     bestAction = a;
+                 }
+             }
+             // Evaluate that action using the other table
+             if (bestAction) {
+                 nextQ = otherQTable.get(bestAction) || 0;
+             }
+         }
+         const target = reward + γ * nextQ;
+         const delta = target - currentQ;
+         const newQ = currentQ + α * delta;
+         qTable.set(action, newQ);
+         return delta;
+     }
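+     // Selecting the greedy action with one table and evaluating it with the
+     // other decouples selection from evaluation, which damps the max-operator's
+     // overestimation bias (van Hasselt, 2010).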
+     /**
+      * Actor-Critic: Policy gradient with value baseline
+      */
+     actorCriticUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ } = config;
+         // Critic update (TD error)
+         const V = this.criticValues.get(state) || 0;
+         const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
+         const tdError = reward + γ * V_next - V;
+         this.criticValues.set(state, V + α * tdError);
+         // Actor update (policy gradient)
+         const qTable = this.getQTable(state);
+         const currentQ = qTable.get(action) || 0;
+         // Use TD error as advantage estimate
+         const newQ = currentQ + α * tdError;
+         qTable.set(action, newQ);
+         return tdError;
+     }
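+     // Worked example (illustrative): r=1, γ=0.95, V(s)=0.2, V(s')=0.5 give
+     // TD error = 1 + 0.95*0.5 - 0.2 = 1.275; both the critic value and the
+     // actor's action preference move by α * 1.275.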
+     /**
+      * PPO: Clipped policy gradient for stable training
+      */
+     ppoUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ, clipRange = 0.2 } = config;
+         // Critic update
+         const V = this.criticValues.get(state) || 0;
+         const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
+         const advantage = reward + γ * V_next - V;
+         this.criticValues.set(state, V + α * advantage);
+         // Actor update with clipping
+         const qTable = this.getQTable(state);
+         const oldQ = qTable.get(action) || 0;
+         // Compute probability ratio (simplified)
+         const ratio = Math.exp(α * advantage);
+         const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio));
+         // PPO objective: min(ratio * A, clip(ratio) * A)
+         const update = Math.min(ratio * advantage, clippedRatio * advantage);
+         const newQ = oldQ + α * update;
+         qTable.set(action, newQ);
+         return advantage;
+     }
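+     // Illustrative: clipRange=0.2 confines the ratio to [0.8, 1.2]; if the raw
+     // ratio were 1.5 with advantage A > 0, min(1.5*A, 1.2*A) = 1.2*A caps the step.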
+     /**
+      * TD(λ): Temporal difference with eligibility traces
+      */
+     tdLambdaUpdate(exp, config) {
+         const { state, action, reward, nextState, done } = exp;
+         const { learningRate: α, discountFactor: γ, lambda = 0.8 } = config;
+         const qTable = this.getQTable(state);
+         const nextQTable = this.getQTable(nextState);
+         const currentQ = qTable.get(action) || 0;
+         const maxNextQ = done ? 0 : Math.max(0, ...Array.from(nextQTable.values()));
+         const tdError = reward + γ * maxNextQ - currentQ;
+         // Update eligibility trace for current state-action
+         const traces = this.getEligibilityTraces(state);
+         traces.set(action, (traces.get(action) || 0) + 1);
+         // Update all state-actions with eligibility traces
+         for (const [s, sTraces] of this.eligibilityTraces) {
+             const sQTable = this.getQTable(s);
+             for (const [a, trace] of sTraces) {
+                 const q = sQTable.get(a) || 0;
+                 sQTable.set(a, q + α * tdError * trace);
+                 // Decay trace
+                 sTraces.set(a, γ * lambda * trace);
+             }
+         }
+         return tdError;
+     }
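+     // Illustrative: with γ=0.9 and λ=0.8 every trace decays by 0.72 per step,
+     // so a TD error mostly credits the last few state-action pairs visited.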
+     /**
+      * Monte Carlo: Full episode learning
+      */
+     monteCarloUpdate(config) {
+         const { learningRate: α, discountFactor: γ } = config;
+         const trajectory = this.trajectories[this.trajectories.length - 1];
+         if (!trajectory || trajectory.experiences.length === 0)
+             return 0;
+         let G = 0; // Return
+         let totalDelta = 0;
+         // Work backwards through episode
+         for (let t = trajectory.experiences.length - 1; t >= 0; t--) {
+             const exp = trajectory.experiences[t];
+             G = exp.reward + γ * G;
+             const qTable = this.getQTable(exp.state);
+             const currentQ = qTable.get(exp.action) || 0;
+             const delta = G - currentQ;
+             qTable.set(exp.action, currentQ + α * delta);
+             totalDelta += Math.abs(delta);
+         }
+         trajectory.completed = true;
+         trajectory.totalReward = G;
+         return totalDelta / trajectory.experiences.length;
+     }
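+     // Illustrative: rewards [1, 0, 2] with γ=0.9 give returns G_2 = 2,
+     // G_1 = 0 + 0.9*2 = 1.8, G_0 = 1 + 0.9*1.8 = 2.62 when walking backwards.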
+     /**
+      * Decision Transformer: Sequence modeling for trajectories
+      */
+     decisionTransformerUpdate(config) {
+         const { learningRate: α, sequenceLength = 20 } = config;
+         const trajectory = this.trajectories[this.trajectories.length - 1];
+         if (!trajectory || trajectory.experiences.length === 0)
+             return 0;
+         // Decision Transformer learns to predict actions given (return, state, action) sequences
+         // Here we use a simplified version that learns state-action patterns
+         let totalDelta = 0;
+         const experiences = trajectory.experiences.slice(-sequenceLength);
+         // Calculate returns-to-go
+         const returns = [];
+         let R = 0;
+         for (let i = experiences.length - 1; i >= 0; i--) {
+             R += experiences[i].reward;
+             returns.unshift(R);
+         }
+         // Update Q-values weighted by return-to-go
+         for (let i = 0; i < experiences.length; i++) {
+             const exp = experiences[i];
+             const qTable = this.getQTable(exp.state);
+             const currentQ = qTable.get(exp.action) || 0;
+             // Weight by normalized return
+             const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1);
+             const target = currentQ + α * normalizedReturn * exp.reward;
+             const delta = target - currentQ;
+             qTable.set(exp.action, target);
+             totalDelta += Math.abs(delta);
+         }
+         trajectory.completed = true;
+         trajectory.totalReward = returns[0];
+         return totalDelta / experiences.length;
+     }
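+     // Illustrative: rewards [1, 2, 3] yield undiscounted returns-to-go [6, 5, 3];
+     // each Q-update is scaled by returns[i] / (|returns[0]| + 1), here [6/7, 5/7, 3/7].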
+     /**
+      * DQN: Deep Q-Network (simplified, without an actual neural network)
+      * Uses experience replay and target network concepts
+      */
+     dqnUpdate(exp, config) {
+         // Add to replay buffer (trajectory)
+         this.addToCurrentTrajectory(exp);
+         // Sample from replay buffer
+         const replayExp = this.sampleFromReplay();
+         if (!replayExp)
+             return this.qLearningUpdate(exp, config);
+         // Use sampled experience for update (breaks correlation)
+         return this.qLearningUpdate(replayExp, config);
+     }
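+     // Replaying a uniformly sampled past experience instead of always the newest
+     // one breaks the temporal correlation between consecutive updates.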
+     // ============ Helper Methods ============
+     getQTable(state) {
+         if (!this.qTables.has(state)) {
+             this.qTables.set(state, new Map());
+         }
+         return this.qTables.get(state);
+     }
+     getQTable2(state) {
+         if (!this.qTables2.has(state)) {
+             this.qTables2.set(state, new Map());
+         }
+         return this.qTables2.get(state);
+     }
+     getEligibilityTraces(state) {
+         if (!this.eligibilityTraces.has(state)) {
+             this.eligibilityTraces.set(state, new Map());
+         }
+         return this.eligibilityTraces.get(state);
+     }
+     softmaxConfidence(values, selectedIdx) {
+         if (values.length === 0)
+             return 0.5;
+         const maxVal = Math.max(...values);
+         const expValues = values.map(v => Math.exp(v - maxVal));
+         const sumExp = expValues.reduce((a, b) => a + b, 0);
+         return expValues[selectedIdx] / sumExp;
+     }
+     addToCurrentTrajectory(exp) {
+         if (this.trajectories.length === 0 || this.trajectories[this.trajectories.length - 1].completed) {
+             this.trajectories.push({
+                 experiences: [],
+                 totalReward: 0,
+                 completed: false,
+             });
+         }
+         this.trajectories[this.trajectories.length - 1].experiences.push(exp);
+     }
+     sampleFromReplay() {
+         const allExperiences = [];
+         for (const traj of this.trajectories) {
+             allExperiences.push(...traj.experiences);
+         }
+         if (allExperiences.length === 0)
+             return null;
+         return allExperiences[Math.floor(Math.random() * allExperiences.length)];
+     }
+     updateStats(algorithm, reward, delta) {
+         const stats = this.stats.get(algorithm);
+         if (!stats)
+             return;
+         stats.updates++;
+         stats.lastUpdate = Date.now();
+         // Running average reward
+         this.rewardHistory.push(reward);
+         if (this.rewardHistory.length > 1000) {
+             this.rewardHistory.shift();
+         }
+         stats.avgReward = this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length;
+         // Convergence score (inverse of recent delta magnitude)
+         stats.convergenceScore = 1 / (1 + delta);
+     }
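+     // Illustrative: a delta of 0 gives convergenceScore 1 (converged), a delta
+     // of 1 gives 0.5, and large deltas push the score toward 0.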
+     /**
+      * Get statistics for all algorithms
+      */
+     getStats() {
+         return new Map(this.stats);
+     }
+     /**
+      * Get statistics summary
+      */
+     getStatsSummary() {
+         let bestAlgorithm = 'q-learning';
+         let bestScore = -Infinity;
+         let totalUpdates = 0;
+         const algorithms = [];
+         for (const [alg, stats] of this.stats) {
+             algorithms.push(stats);
+             totalUpdates += stats.updates;
+             const score = stats.avgReward * stats.convergenceScore;
+             if (score > bestScore && stats.updates > 0) {
+                 bestScore = score;
+                 bestAlgorithm = alg;
+             }
+         }
+         return {
+             bestAlgorithm,
+             totalUpdates,
+             avgReward: this.rewardHistory.length > 0
+                 ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length
+                 : 0,
+             algorithms: algorithms.filter(a => a.updates > 0),
+         };
+     }
+     /**
+      * Export state for persistence
+      */
+     export() {
+         const qTables = {};
+         for (const [state, actions] of this.qTables) {
+             qTables[state] = Object.fromEntries(actions);
+         }
+         const qTables2 = {};
+         for (const [state, actions] of this.qTables2) {
+             qTables2[state] = Object.fromEntries(actions);
+         }
+         const criticValues = Object.fromEntries(this.criticValues);
+         const stats = {};
+         for (const [alg, s] of this.stats) {
+             stats[alg] = s;
+         }
+         const configs = {};
+         for (const [task, config] of this.configs) {
+             configs[task] = config;
+         }
+         return {
+             qTables,
+             qTables2,
+             criticValues,
+             trajectories: this.trajectories.slice(-100), // Keep last 100 trajectories
+             stats,
+             configs,
+             rewardHistory: this.rewardHistory.slice(-1000),
+         };
+     }
+     /**
+      * Import state from persistence
+      */
+     import(data) {
+         // Q-tables
+         this.qTables.clear();
+         for (const [state, actions] of Object.entries(data.qTables || {})) {
+             this.qTables.set(state, new Map(Object.entries(actions)));
+         }
+         this.qTables2.clear();
+         for (const [state, actions] of Object.entries(data.qTables2 || {})) {
+             this.qTables2.set(state, new Map(Object.entries(actions)));
+         }
+         // Critic values
+         this.criticValues = new Map(Object.entries(data.criticValues || {}));
+         // Trajectories
+         this.trajectories = data.trajectories || [];
+         // Stats
+         for (const [alg, s] of Object.entries(data.stats || {})) {
+             this.stats.set(alg, s);
+         }
+         // Configs
+         for (const [task, config] of Object.entries(data.configs || {})) {
+             this.configs.set(task, config);
+         }
+         // Reward history
+         this.rewardHistory = data.rewardHistory || [];
+     }
+     /**
+      * Clear all learning data
+      */
+     clear() {
+         this.qTables.clear();
+         this.qTables2.clear();
+         this.eligibilityTraces.clear();
+         this.actorWeights.clear();
+         this.criticValues.clear();
+         this.trajectories = [];
+         this.rewardHistory = [];
+         // Reset stats
+         for (const stats of this.stats.values()) {
+             stats.updates = 0;
+             stats.avgReward = 0;
+             stats.convergenceScore = 0;
+         }
+     }
+     /**
+      * Get available algorithms
+      */
+     static getAlgorithms() {
+         return [
+             { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' },
+             { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' },
+             { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' },
+             { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' },
+             { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' },
+             { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' },
+             { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' },
+             { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' },
+             { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' },
+         ];
+     }
+ }
+ exports.LearningEngine = LearningEngine;
+ exports.default = LearningEngine;
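
A minimal usage sketch of the class added in this version, assuming it is loaded from the package's compiled output (the exact require path is an assumption, not confirmed by the diff):

const { LearningEngine } = require('ruvector'); // hypothetical entry point
const engine = new LearningEngine();
// Record one experience; 'error-avoidance' routes to the Expected SARSA update
const delta = engine.update('error-avoidance', {
    state: 'hook:pre-commit',
    action: 'run-linter',
    reward: 1,
    nextState: 'hook:post-commit',
    done: true,
});
// ε-greedy action selection with a softmax confidence estimate
const { action, confidence } = engine.getBestAction('error-avoidance', 'hook:pre-commit', ['run-linter', 'skip-linter']);
const snapshot = engine.export(); // JSON-serializable state for persistence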