@arcanea/guardian-evolution 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/dist/algorithms/a2c.d.ts +86 -0
  2. package/dist/algorithms/a2c.d.ts.map +1 -0
  3. package/dist/algorithms/a2c.js +361 -0
  4. package/dist/algorithms/a2c.js.map +1 -0
  5. package/dist/algorithms/curiosity.d.ts +82 -0
  6. package/dist/algorithms/curiosity.d.ts.map +1 -0
  7. package/dist/algorithms/curiosity.js +392 -0
  8. package/dist/algorithms/curiosity.js.map +1 -0
  9. package/dist/algorithms/decision-transformer.d.ts +82 -0
  10. package/dist/algorithms/decision-transformer.d.ts.map +1 -0
  11. package/dist/algorithms/decision-transformer.js +415 -0
  12. package/dist/algorithms/decision-transformer.js.map +1 -0
  13. package/dist/algorithms/dqn.d.ts +72 -0
  14. package/dist/algorithms/dqn.d.ts.map +1 -0
  15. package/dist/algorithms/dqn.js +303 -0
  16. package/dist/algorithms/dqn.js.map +1 -0
  17. package/dist/algorithms/index.d.ts +32 -0
  18. package/dist/algorithms/index.d.ts.map +1 -0
  19. package/dist/algorithms/index.js +74 -0
  20. package/dist/algorithms/index.js.map +1 -0
  21. package/dist/algorithms/ppo.d.ts +72 -0
  22. package/dist/algorithms/ppo.d.ts.map +1 -0
  23. package/dist/algorithms/ppo.js +331 -0
  24. package/dist/algorithms/ppo.js.map +1 -0
  25. package/dist/algorithms/q-learning.d.ts +77 -0
  26. package/dist/algorithms/q-learning.d.ts.map +1 -0
  27. package/dist/algorithms/q-learning.js +259 -0
  28. package/dist/algorithms/q-learning.js.map +1 -0
  29. package/dist/algorithms/sarsa.d.ts +82 -0
  30. package/dist/algorithms/sarsa.d.ts.map +1 -0
  31. package/dist/algorithms/sarsa.js +297 -0
  32. package/dist/algorithms/sarsa.js.map +1 -0
  33. package/dist/index.d.ts +118 -0
  34. package/dist/index.d.ts.map +1 -0
  35. package/dist/index.js +201 -0
  36. package/dist/index.js.map +1 -0
  37. package/dist/modes/balanced.d.ts +60 -0
  38. package/dist/modes/balanced.d.ts.map +1 -0
  39. package/dist/modes/balanced.js +234 -0
  40. package/dist/modes/balanced.js.map +1 -0
  41. package/dist/modes/batch.d.ts +82 -0
  42. package/dist/modes/batch.d.ts.map +1 -0
  43. package/dist/modes/batch.js +316 -0
  44. package/dist/modes/batch.js.map +1 -0
  45. package/dist/modes/edge.d.ts +85 -0
  46. package/dist/modes/edge.d.ts.map +1 -0
  47. package/dist/modes/edge.js +310 -0
  48. package/dist/modes/edge.js.map +1 -0
  49. package/dist/modes/index.d.ts +55 -0
  50. package/dist/modes/index.d.ts.map +1 -0
  51. package/dist/modes/index.js +83 -0
  52. package/dist/modes/index.js.map +1 -0
  53. package/dist/modes/real-time.d.ts +58 -0
  54. package/dist/modes/real-time.d.ts.map +1 -0
  55. package/dist/modes/real-time.js +196 -0
  56. package/dist/modes/real-time.js.map +1 -0
  57. package/dist/modes/research.d.ts +79 -0
  58. package/dist/modes/research.d.ts.map +1 -0
  59. package/dist/modes/research.js +389 -0
  60. package/dist/modes/research.js.map +1 -0
  61. package/dist/pattern-learner.d.ts +117 -0
  62. package/dist/pattern-learner.d.ts.map +1 -0
  63. package/dist/pattern-learner.js +603 -0
  64. package/dist/pattern-learner.js.map +1 -0
  65. package/dist/reasoning-bank.d.ts +259 -0
  66. package/dist/reasoning-bank.d.ts.map +1 -0
  67. package/dist/reasoning-bank.js +993 -0
  68. package/dist/reasoning-bank.js.map +1 -0
  69. package/dist/reasoningbank-adapter.d.ts +168 -0
  70. package/dist/reasoningbank-adapter.d.ts.map +1 -0
  71. package/dist/reasoningbank-adapter.js +463 -0
  72. package/dist/reasoningbank-adapter.js.map +1 -0
  73. package/dist/sona-integration.d.ts +168 -0
  74. package/dist/sona-integration.d.ts.map +1 -0
  75. package/dist/sona-integration.js +316 -0
  76. package/dist/sona-integration.js.map +1 -0
  77. package/dist/sona-manager.d.ts +147 -0
  78. package/dist/sona-manager.d.ts.map +1 -0
  79. package/dist/sona-manager.js +695 -0
  80. package/dist/sona-manager.js.map +1 -0
  81. package/dist/types.d.ts +431 -0
  82. package/dist/types.d.ts.map +1 -0
  83. package/dist/types.js +11 -0
  84. package/dist/types.js.map +1 -0
  85. package/package.json +47 -0
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Proximal Policy Optimization (PPO)
3
+ *
4
+ * Implements PPO algorithm for stable policy learning with:
5
+ * - Clipped surrogate objective
6
+ * - GAE (Generalized Advantage Estimation)
7
+ * - Value function clipping
8
+ * - Entropy bonus
9
+ *
10
+ * Performance Target: <10ms per update step
11
+ */
12
/**
 * Default PPO configuration.
 * Hyperparameters follow common PPO defaults: lr 3e-4, gamma 0.99,
 * clip range 0.2, GAE lambda 0.95, 4 epochs of mini-batch size 64.
 */
export const DEFAULT_PPO_CONFIG = {
    algorithm: 'ppo',
    learningRate: 0.0003,
    gamma: 0.99,
    entropyCoef: 0.01,
    valueLossCoef: 0.5,
    maxGradNorm: 0.5,
    epochs: 4,
    miniBatchSize: 64,
    clipRange: 0.2,
    clipRangeVf: null, // null disables value-function clipping
    targetKL: 0.01,
    gaeLambda: 0.95,
};
/**
 * PPO Algorithm Implementation.
 *
 * Simplified linear policy/value approximation over a fixed 768-dim state,
 * trained with the clipped surrogate objective, GAE advantages, and a
 * heavy-ball momentum optimizer.
 *
 * NOTE(review): entropyCoef, valueLossCoef, and maxGradNorm exist in the
 * config but are not applied in the simplified gradient computation below —
 * confirm whether that is intentional.
 */
export class PPOAlgorithm {
    config;
    // Policy/value parameters (simplified linear model for speed)
    policyWeights;
    valueWeights;
    // Momentum buffers for the optimizer
    policyMomentum;
    valueMomentum;
    // Experience buffer: filled by addExperience(), consumed and cleared by update()
    buffer = [];
    // Running statistics, exposed via getStats()
    updateCount = 0;
    totalLoss = 0;
    approxKL = 0;
    clipFraction = 0;
    /**
     * @param {object} [config] - partial overrides merged over DEFAULT_PPO_CONFIG
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_PPO_CONFIG, ...config };
        // Fixed input dimensionality (simplified model)
        const dim = 768;
        this.policyWeights = new Float32Array(dim);
        this.valueWeights = new Float32Array(dim);
        this.policyMomentum = new Float32Array(dim);
        this.valueMomentum = new Float32Array(dim);
        // Xavier-style init: zero-mean uniform scaled by sqrt(2/dim)
        const scale = Math.sqrt(2 / dim);
        for (let i = 0; i < dim; i++) {
            this.policyWeights[i] = (Math.random() - 0.5) * scale;
            this.valueWeights[i] = (Math.random() - 0.5) * scale;
        }
    }
    /**
     * Add experience from a trajectory.
     *
     * Computes per-step state values, GAE advantages and discounted returns,
     * then pushes one buffer entry per step.
     *
     * NOTE(review): values and log-probs are computed from step.stateAfter;
     * classic PPO uses the state the action was taken in (stateBefore) —
     * confirm against the Trajectory producer before changing.
     *
     * @param {object} trajectory - { steps: [{ stateAfter, action, reward, ... }] }
     */
    addExperience(trajectory) {
        if (trajectory.steps.length === 0)
            return;
        // Compute values for each step
        const values = trajectory.steps.map(step => this.computeValue(step.stateAfter));
        // Compute advantages using GAE
        const advantages = this.computeGAE(trajectory.steps.map(s => s.reward), values);
        // Compute discounted returns (value-function regression targets)
        const returns = this.computeReturns(trajectory.steps.map(s => s.reward));
        // Add to buffer
        for (let i = 0; i < trajectory.steps.length; i++) {
            const step = trajectory.steps[i];
            this.buffer.push({
                state: step.stateAfter,
                action: this.hashAction(step.action),
                reward: step.reward,
                value: values[i],
                logProb: this.computeLogProb(step.stateAfter, step.action),
                advantage: advantages[i],
                return_: returns[i],
            });
        }
    }
    /**
     * Perform a PPO update over the buffered experience.
     * Target: <10ms (a console warning is emitted when exceeded).
     *
     * Returns zero losses (and leaves the buffer intact) when fewer than
     * miniBatchSize experiences have been collected.
     *
     * @returns {{policyLoss: number, valueLoss: number, entropy: number}}
     */
    update() {
        const startTime = performance.now();
        if (this.buffer.length < this.config.miniBatchSize) {
            return { policyLoss: 0, valueLoss: 0, entropy: 0 };
        }
        // Normalize advantages to zero mean / unit std (epsilon for stability)
        const advantages = this.buffer.map(e => e.advantage);
        const advMean = advantages.reduce((a, b) => a + b, 0) / advantages.length;
        const advStd = Math.sqrt(advantages.reduce((a, b) => a + (b - advMean) ** 2, 0) / advantages.length) + 1e-8;
        for (const exp of this.buffer) {
            exp.advantage = (exp.advantage - advMean) / advStd;
        }
        let totalPolicyLoss = 0;
        let totalValueLoss = 0;
        let totalEntropy = 0;
        let totalClipFrac = 0;
        let totalKL = 0;
        let numUpdates = 0;
        // Multiple epochs over shuffled mini-batches
        let stopEarly = false;
        for (let epoch = 0; epoch < this.config.epochs && !stopEarly; epoch++) {
            // Shuffle buffer
            this.shuffleBuffer();
            // Process mini-batches
            for (let i = 0; i < this.buffer.length; i += this.config.miniBatchSize) {
                const batch = this.buffer.slice(i, i + this.config.miniBatchSize);
                // Skip degenerate tail batches
                if (batch.length < this.config.miniBatchSize / 2)
                    continue;
                const result = this.updateMiniBatch(batch);
                totalPolicyLoss += result.policyLoss;
                totalValueLoss += result.valueLoss;
                totalEntropy += result.entropy;
                totalClipFrac += result.clipFrac;
                totalKL += result.kl;
                numUpdates++;
                // Early stopping if KL too high.
                // BUGFIX: previously this only broke the mini-batch loop, so the
                // remaining epochs kept training past the KL threshold. Now the
                // entire update stops, as intended.
                if (result.kl > this.config.targetKL * 1.5) {
                    stopEarly = true;
                    break;
                }
            }
        }
        // Clear buffer (PPO is on-policy; old experience is discarded)
        this.buffer = [];
        this.updateCount++;
        // BUGFIX: persist running statistics. These fields were declared and
        // returned by getStats() but never written, so stats always read 0.
        if (numUpdates > 0) {
            this.approxKL = totalKL / numUpdates;
            this.clipFraction = totalClipFrac / numUpdates;
            this.totalLoss += (totalPolicyLoss + totalValueLoss) / numUpdates;
        }
        const elapsed = performance.now() - startTime;
        if (elapsed > 10) {
            console.warn(`PPO update exceeded target: ${elapsed.toFixed(2)}ms > 10ms`);
        }
        return {
            policyLoss: numUpdates > 0 ? totalPolicyLoss / numUpdates : 0,
            valueLoss: numUpdates > 0 ? totalValueLoss / numUpdates : 0,
            entropy: numUpdates > 0 ? totalEntropy / numUpdates : 0,
        };
    }
    /**
     * Sample an action from the current policy.
     * @param {Float32Array} state
     * @returns {{action: number, logProb: number, value: number}}
     */
    getAction(state) {
        const logits = this.computeLogits(state);
        const probs = this.softmax(logits);
        const action = this.sampleAction(probs);
        return {
            action,
            logProb: Math.log(probs[action] + 1e-8),
            value: this.computeValue(state),
        };
    }
    /**
     * Get running statistics (updated at the end of each update()).
     * @returns {Record<string, number>}
     */
    getStats() {
        return {
            updateCount: this.updateCount,
            bufferSize: this.buffer.length,
            avgLoss: this.updateCount > 0 ? this.totalLoss / this.updateCount : 0,
            approxKL: this.approxKL,
            clipFraction: this.clipFraction,
        };
    }
    // ==========================================================================
    // Private Methods
    // ==========================================================================
    /** Linear value estimate: dot(state, valueWeights). */
    computeValue(state) {
        let value = 0;
        for (let i = 0; i < Math.min(state.length, this.valueWeights.length); i++) {
            value += state[i] * this.valueWeights[i];
        }
        return value;
    }
    /**
     * Policy logits for 4 discrete actions. Each action shares the same weight
     * vector scaled by (1 + a * 0.1) — a simplification, not a learned head per action.
     */
    computeLogits(state) {
        // Simplified: 4 discrete actions
        const numActions = 4;
        const logits = new Float32Array(numActions);
        for (let a = 0; a < numActions; a++) {
            for (let i = 0; i < Math.min(state.length, this.policyWeights.length); i++) {
                logits[a] += state[i] * this.policyWeights[i] * (1 + a * 0.1);
            }
        }
        return logits;
    }
    /** Log-probability of the (hashed) action under the current policy. */
    computeLogProb(state, action) {
        const logits = this.computeLogits(state);
        const probs = this.softmax(logits);
        const actionIdx = this.hashAction(action);
        return Math.log(probs[actionIdx] + 1e-8);
    }
    /** Deterministic hash of an action string to an index in [0, 3]. */
    hashAction(action) {
        let hash = 0;
        for (let i = 0; i < action.length; i++) {
            hash = (hash * 31 + action.charCodeAt(i)) % 4;
        }
        return hash;
    }
    /** Numerically-stable softmax (max-subtraction before exponentiation). */
    softmax(logits) {
        const max = Math.max(...logits);
        const exps = new Float32Array(logits.length);
        let sum = 0;
        for (let i = 0; i < logits.length; i++) {
            exps[i] = Math.exp(logits[i] - max);
            sum += exps[i];
        }
        for (let i = 0; i < exps.length; i++) {
            exps[i] /= sum;
        }
        return exps;
    }
    /** Sample an index from a categorical distribution via inverse CDF. */
    sampleAction(probs) {
        const r = Math.random();
        let cumSum = 0;
        for (let i = 0; i < probs.length; i++) {
            cumSum += probs[i];
            if (r < cumSum)
                return i;
        }
        // Guard against floating-point round-off leaving r >= cumSum
        return probs.length - 1;
    }
    /**
     * Generalized Advantage Estimation, computed backwards over the trajectory.
     * The last step bootstraps with a next-value of 0 (treated as terminal).
     */
    computeGAE(rewards, values) {
        const advantages = new Array(rewards.length).fill(0);
        let lastGae = 0;
        for (let t = rewards.length - 1; t >= 0; t--) {
            const nextValue = t < rewards.length - 1 ? values[t + 1] : 0;
            const delta = rewards[t] + this.config.gamma * nextValue - values[t];
            lastGae = delta + this.config.gamma * this.config.gaeLambda * lastGae;
            advantages[t] = lastGae;
        }
        return advantages;
    }
    /** Discounted returns, computed backwards: G_t = r_t + gamma * G_{t+1}. */
    computeReturns(rewards) {
        const returns = new Array(rewards.length).fill(0);
        let cumReturn = 0;
        for (let t = rewards.length - 1; t >= 0; t--) {
            cumReturn = rewards[t] + this.config.gamma * cumReturn;
            returns[t] = cumReturn;
        }
        return returns;
    }
    /** In-place Fisher–Yates shuffle of the experience buffer. */
    shuffleBuffer() {
        for (let i = this.buffer.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [this.buffer[i], this.buffer[j]] = [this.buffer[j], this.buffer[i]];
        }
    }
    /**
     * One gradient step on a mini-batch: clipped surrogate policy loss,
     * (optionally clipped) value loss, entropy and approximate-KL tracking,
     * then a momentum-SGD weight update.
     */
    updateMiniBatch(batch) {
        let policyLoss = 0;
        let valueLoss = 0;
        let entropy = 0;
        let clipFrac = 0;
        let kl = 0;
        const policyGrad = new Float32Array(this.policyWeights.length);
        const valueGrad = new Float32Array(this.valueWeights.length);
        for (const exp of batch) {
            // Current policy evaluation
            const logits = this.computeLogits(exp.state);
            const probs = this.softmax(logits);
            const newLogProb = Math.log(probs[exp.action] + 1e-8);
            const currentValue = this.computeValue(exp.state);
            // Importance ratio pi_new / pi_old
            const ratio = Math.exp(newLogProb - exp.logProb);
            // Clipped surrogate objective
            const surr1 = ratio * exp.advantage;
            const surr2 = Math.max(Math.min(ratio, 1 + this.config.clipRange), 1 - this.config.clipRange) * exp.advantage;
            const policyLossI = -Math.min(surr1, surr2);
            policyLoss += policyLossI;
            // Track the fraction of samples where the ratio was clipped
            if (Math.abs(ratio - 1) > this.config.clipRange) {
                clipFrac++;
            }
            // First-order KL divergence approximation: E[old_logp - new_logp]
            kl += (exp.logProb - newLogProb);
            // Value loss (optionally clipped against the stored rollout value)
            let valueLossI;
            if (this.config.clipRangeVf !== null) {
                const valuePred = currentValue;
                const valueClipped = exp.value + Math.max(Math.min(valuePred - exp.value, this.config.clipRangeVf), -this.config.clipRangeVf);
                const vf1 = (valuePred - exp.return_) ** 2;
                const vf2 = (valueClipped - exp.return_) ** 2;
                valueLossI = Math.max(vf1, vf2);
            }
            else {
                valueLossI = (currentValue - exp.return_) ** 2;
            }
            valueLoss += valueLossI;
            // Entropy of the current action distribution
            let entropyI = 0;
            for (const p of probs) {
                if (p > 0)
                    entropyI -= p * Math.log(p);
            }
            entropy += entropyI;
            // Simplified gradient: state scaled by per-sample loss (0.01 damping)
            for (let i = 0; i < Math.min(exp.state.length, policyGrad.length); i++) {
                policyGrad[i] += exp.state[i] * policyLossI * 0.01;
                valueGrad[i] += exp.state[i] * valueLossI * 0.01;
            }
        }
        // Apply gradients with heavy-ball momentum (beta = 0.9)
        const lr = this.config.learningRate;
        const beta = 0.9;
        for (let i = 0; i < this.policyWeights.length; i++) {
            this.policyMomentum[i] = beta * this.policyMomentum[i] + (1 - beta) * policyGrad[i];
            this.policyWeights[i] -= lr * this.policyMomentum[i];
            this.valueMomentum[i] = beta * this.valueMomentum[i] + (1 - beta) * valueGrad[i];
            this.valueWeights[i] -= lr * this.valueMomentum[i];
        }
        return {
            policyLoss: policyLoss / batch.length,
            valueLoss: valueLoss / batch.length,
            entropy: entropy / batch.length,
            clipFrac: clipFrac / batch.length,
            kl: kl / batch.length,
        };
    }
}
325
/**
 * Convenience factory for constructing a PPOAlgorithm.
 * @param {object} [config] - partial PPO configuration overrides
 * @returns {PPOAlgorithm} a freshly constructed algorithm instance
 */
export function createPPO(config) {
    const algorithm = new PPOAlgorithm(config);
    return algorithm;
}
331
+ //# sourceMappingURL=ppo.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ppo.js","sourceRoot":"","sources":["../../src/algorithms/ppo.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAQH;;GAEG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAc;IAC3C,SAAS,EAAE,KAAK;IAChB,YAAY,EAAE,MAAM;IACpB,KAAK,EAAE,IAAI;IACX,WAAW,EAAE,IAAI;IACjB,aAAa,EAAE,GAAG;IAClB,WAAW,EAAE,GAAG;IAChB,MAAM,EAAE,CAAC;IACT,aAAa,EAAE,EAAE;IACjB,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,IAAI;IACjB,QAAQ,EAAE,IAAI;IACd,SAAS,EAAE,IAAI;CAChB,CAAC;AAeF;;GAEG;AACH,MAAM,OAAO,YAAY;IACf,MAAM,CAAY;IAE1B,6DAA6D;IACrD,aAAa,CAAe;IAC5B,YAAY,CAAe;IAEnC,kBAAkB;IACV,cAAc,CAAe;IAC7B,aAAa,CAAe;IAEpC,oBAAoB;IACZ,MAAM,GAAoB,EAAE,CAAC;IAErC,aAAa;IACL,WAAW,GAAG,CAAC,CAAC;IAChB,SAAS,GAAG,CAAC,CAAC;IACd,QAAQ,GAAG,CAAC,CAAC;IACb,YAAY,GAAG,CAAC,CAAC;IAEzB,YAAY,SAA6B,EAAE;QACzC,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,kBAAkB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEnD,iDAAiD;QACjD,MAAM,GAAG,GAAG,GAAG,CAAC;QAChB,IAAI,CAAC,aAAa,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;QAC3C,IAAI,CAAC,YAAY,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,CAAC,cAAc,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;QAC5C,IAAI,CAAC,aAAa,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;QAE3C,wBAAwB;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;QACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7B,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,GAAG,KAAK,CAAC;YACtD,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,GAAG,KAAK,CAAC;QACvD,CAAC;IACH,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,UAAsB;QAClC,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QAE1C,+BAA+B;QAC/B,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CACzC,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,UAAU,CAAC,CACnC,CAAC;QAEF,+BAA+B;QAC/B,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAChC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,EACnC,MAAM,CACP,CAAC;QAEF,kBAAkB;QAClB,MAAM,OAAO,GAAG,IAAI,CAAC,cAAc,CACjC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CACpC,CAAC;QAEF,gBAAgB;QAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAA
C,GAAG,UAAU,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACjC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC;gBACf,KAAK,EAAE,IAAI,CAAC,UAAU;gBACtB,MAAM,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC;gBACpC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;gBAChB,OAAO,EAAE,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC;gBAC1D,SAAS,EAAE,UAAU,CAAC,CAAC,CAAC;gBACxB,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;aACpB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,MAAM;QACJ,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,EAAE,CAAC;YACnD,OAAO,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;QACrD,CAAC;QAED,uBAAuB;QACvB,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACrD,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;QAC1E,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CACtB,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAC3E,GAAG,IAAI,CAAC;QAET,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAC9B,GAAG,CAAC,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,GAAG,MAAM,CAAC;QACrD,CAAC;QAED,IAAI,eAAe,GAAG,CAAC,CAAC;QACxB,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,kBAAkB;QAClB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC;YACxD,iBAAiB;YACjB,IAAI,CAAC,aAAa,EAAE,CAAC;YAErB,uBAAuB;YACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,aAAa,EAAE,CAAC;gBACvE,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;gBAClE,IAAI,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,GAAG,CAAC;oBAAE,SAAS;gBAE3D,MAAM,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC
,KAAK,CAAC,CAAC;gBAC3C,eAAe,IAAI,MAAM,CAAC,UAAU,CAAC;gBACrC,cAAc,IAAI,MAAM,CAAC,SAAS,CAAC;gBACnC,YAAY,IAAI,MAAM,CAAC,OAAO,CAAC;gBAC/B,aAAa,IAAI,MAAM,CAAC,QAAQ,CAAC;gBACjC,OAAO,IAAI,MAAM,CAAC,EAAE,CAAC;gBACrB,UAAU,EAAE,CAAC;gBAEb,gCAAgC;gBAChC,IAAI,MAAM,CAAC,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,GAAG,GAAG,EAAE,CAAC;oBAC3C,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;QAED,eAAe;QACf,IAAI,CAAC,MAAM,GAAG,EAAE,CAAC;QACjB,IAAI,CAAC,WAAW,EAAE,CAAC;QAEnB,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;QAC9C,IAAI,OAAO,GAAG,EAAE,EAAE,CAAC;YACjB,OAAO,CAAC,IAAI,CAAC,+BAA+B,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAC7E,CAAC;QAED,OAAO;YACL,UAAU,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC7D,SAAS,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3D,OAAO,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;SACxD,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,KAAmB;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QACnC,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAExC,OAAO;YACL,MAAM;YACN,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC;YACvC,KAAK,EAAE,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC;SAChC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO;YACL,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM;YAC9B,OAAO,EAAE,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;YACrE,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,YAAY,EAAE,IAAI,CAAC,YAAY;SAChC,CAAC;IACJ,CAAC;IAED,6EAA6E;IAC7E,kBAAkB;IAClB,6EAA6E;IAErE,YAAY,CAAC,KAAmB;QACtC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1E,KAAK,IAAI,KAAK,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC3C,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,aAAa,CAAC,KAAmB;QACvC,iCAAiC;QACjC,MAAM,UAAU,GAAG,CAAC,CAAC;QACrB,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,UAAU,CAAC,C
AAC;QAE5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC3E,MAAM,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CAAC,KAAmB,EAAE,MAAc;QACxD,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;QACzC,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAC1C,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC;IAC3C,CAAC;IAEO,UAAU,CAAC,MAAc;QAC/B,oCAAoC;QACpC,IAAI,IAAI,GAAG,CAAC,CAAC;QACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,IAAI,GAAG,CAAC,IAAI,GAAG,EAAE,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAChD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,OAAO,CAAC,MAAoB;QAClC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;QAChC,MAAM,IAAI,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC7C,IAAI,GAAG,GAAG,CAAC,CAAC;QAEZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;YACpC,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC;QACjB,CAAC;QAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,IAAI,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC;QACjB,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,YAAY,CAAC,KAAmB;QACtC,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACxB,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,GAAG,MAAM;gBAAE,OAAO,CAAC,CAAC;QAC3B,CAAC;QACD,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;IAC1B,CAAC;IAEO,UAAU,CAAC,OAAiB,EAAE,MAAgB;QACpD,MAAM,UAAU,GAAG,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACrD,IAAI,OA
AO,GAAG,CAAC,CAAC;QAEhB,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7D,MAAM,KAAK,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,GAAG,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YACrE,OAAO,GAAG,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,GAAG,OAAO,CAAC;YACtE,UAAU,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC;QAC1B,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAEO,cAAc,CAAC,OAAiB;QACtC,MAAM,OAAO,GAAG,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,SAAS,GAAG,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,GAAG,SAAS,CAAC;YACvD,OAAO,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC;QACzB,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAEO,aAAa;QACnB,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChD,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACtE,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,KAAsB;QAO5C,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,OAAO,GAAG,CAAC,CAAC;QAChB,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,EAAE,GAAG,CAAC,CAAC;QAEX,MAAM,UAAU,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/D,MAAM,SAAS,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAE7D,KAAK,MAAM,GAAG,IAAI,KAAK,EAAE,CAAC;YACxB,iBAAiB;YACjB,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAC7C,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;YACnC,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;YACtD,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAElD,gBAAgB;YAChB
,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,GAAG,GAAG,CAAC,OAAO,CAAC,CAAC;YAEjD,8BAA8B;YAC9B,MAAM,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC,SAAS,CAAC;YACpC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CACpB,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,EAC1C,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAC1B,GAAG,GAAG,CAAC,SAAS,CAAC;YAElB,MAAM,WAAW,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;YAC5C,UAAU,IAAI,WAAW,CAAC;YAE1B,iBAAiB;YACjB,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC;gBAChD,QAAQ,EAAE,CAAC;YACb,CAAC;YAED,8BAA8B;YAC9B,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,UAAU,CAAC,CAAC;YAEjC,aAAa;YACb,IAAI,UAAkB,CAAC;YACvB,IAAI,IAAI,CAAC,MAAM,CAAC,WAAW,KAAK,IAAI,EAAE,CAAC;gBACrC,MAAM,SAAS,GAAG,YAAY,CAAC;gBAC/B,MAAM,YAAY,GAAG,GAAG,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,CACvC,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,EACxD,CAAC,IAAI,CAAC,MAAM,CAAC,WAAW,CACzB,CAAC;gBACF,MAAM,GAAG,GAAG,CAAC,SAAS,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAC3C,MAAM,GAAG,GAAG,CAAC,YAAY,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;gBAC9C,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;YAClC,CAAC;iBAAM,CAAC;gBACN,UAAU,GAAG,CAAC,YAAY,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YACjD,CAAC;YACD,SAAS,IAAI,UAAU,CAAC;YAExB,UAAU;YACV,IAAI,QAAQ,GAAG,CAAC,CAAC;YACjB,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;gBACtB,IAAI,CAAC,GAAG,CAAC;oBAAE,QAAQ,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACzC,CAAC;YACD,OAAO,IAAI,QAAQ,CAAC;YAEpB,iCAAiC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvE,UAAU,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,WAAW,GAAG,IAAI,CAAC;gBACnD,SAAS,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,UAAU,GAAG,IAAI,CAAC;YACnD,CAAC;QACH,CAAC;QAED,gCAAgC;QAChC,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;QACpC,MAAM,IAAI,GAAG,GAAG,CAAC;QAEjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,C
AAC;YACnD,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YACpF,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC;YAErD,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YACjF,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;QACrD,CAAC;QAED,OAAO;YACL,UAAU,EAAE,UAAU,GAAG,KAAK,CAAC,MAAM;YACrC,SAAS,EAAE,SAAS,GAAG,KAAK,CAAC,MAAM;YACnC,OAAO,EAAE,OAAO,GAAG,KAAK,CAAC,MAAM;YAC/B,QAAQ,EAAE,QAAQ,GAAG,KAAK,CAAC,MAAM;YACjC,EAAE,EAAE,EAAE,GAAG,KAAK,CAAC,MAAM;SACtB,CAAC;IACJ,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,MAA2B;IACnD,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC;AAClC,CAAC"}
@@ -0,0 +1,77 @@
1
/**
 * Tabular Q-Learning
 *
 * Classic Q-learning algorithm with:
 * - Epsilon-greedy exploration
 * - State hashing for continuous states
 * - Eligibility traces (optional)
 * - Experience replay
 *
 * Suitable for smaller state spaces or discretized environments.
 * Performance Target: <1ms per update
 */
import type { Trajectory, RLConfig } from '../types.js';
/**
 * Q-Learning configuration
 */
export interface QLearningConfig extends RLConfig {
    algorithm: 'q-learning';
    /** Starting epsilon for epsilon-greedy exploration (default 1.0). */
    explorationInitial: number;
    /** Final (floor) epsilon after decay (default 0.01). */
    explorationFinal: number;
    /** Decay constant for the epsilon schedule — presumably a step count; confirm in q-learning.js. */
    explorationDecay: number;
    /** Maximum number of hashed states kept in the Q-table (default 10000; see pruneQTable). */
    maxStates: number;
    /** Enable eligibility traces for multi-step credit assignment (default false). */
    useEligibilityTraces: boolean;
    /** Trace decay factor (lambda) used when eligibility traces are enabled. */
    traceDecay: number;
}
/**
 * Default Q-Learning configuration
 */
export declare const DEFAULT_QLEARNING_CONFIG: QLearningConfig;
/**
 * Q-Learning Algorithm Implementation
 *
 * Maintains a hashed Q-table over discretized states with epsilon-greedy
 * action selection and optional eligibility traces.
 */
export declare class QLearning {
    private config;
    // Q-table keyed by hashed state
    private qTable;
    // Current epsilon-greedy exploration rate (starts at explorationInitial)
    private epsilon;
    // Total environment steps seen (drives epsilon decay)
    private stepCount;
    // Size of the discrete action space
    private numActions;
    // Eligibility traces, keyed like the Q-table (used only when enabled)
    private traces;
    // Statistics
    private updateCount;
    private avgTDError;
    constructor(config?: Partial<QLearningConfig>);
    /**
     * Update Q-values from trajectory
     * @param trajectory - completed rollout to learn from
     * @returns tdError - TD error for the update (presumably averaged over steps; confirm in q-learning.js)
     */
    update(trajectory: Trajectory): {
        tdError: number;
    };
    /**
     * Get action using epsilon-greedy policy
     * @param state - continuous state vector (hashed internally)
     * @param explore - when false, always act greedily
     */
    getAction(state: Float32Array, explore?: boolean): number;
    /**
     * Get Q-values for a state
     * @returns one Q-value per action
     */
    getQValues(state: Float32Array): Float32Array;
    /**
     * Get statistics
     */
    getStats(): Record<string, number>;
    /**
     * Reset Q-table
     */
    reset(): void;
    private hashState;
    private hashAction;
    private getOrCreateEntry;
    private updateTrace;
    private updateWithTraces;
    private pruneQTable;
    private argmax;
}
/**
 * Factory function
 */
export declare function createQLearning(config?: Partial<QLearningConfig>): QLearning;
//# sourceMappingURL=q-learning.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"q-learning.d.ts","sourceRoot":"","sources":["../../src/algorithms/q-learning.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAExD;;GAEG;AACH,MAAM,WAAW,eAAgB,SAAQ,QAAQ;IAC/C,SAAS,EAAE,YAAY,CAAC;IACxB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,gBAAgB,EAAE,MAAM,CAAC;IACzB,gBAAgB,EAAE,MAAM,CAAC;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,oBAAoB,EAAE,OAAO,CAAC;IAC9B,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAetC,CAAC;AAWF;;GAEG;AACH,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAkB;IAGhC,OAAO,CAAC,MAAM,CAAkC;IAGhD,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,SAAS,CAAK;IAGtB,OAAO,CAAC,UAAU,CAAK;IAGvB,OAAO,CAAC,MAAM,CAAwC;IAGtD,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAK;gBAEX,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAKjD;;OAEG;IACH,MAAM,CAAC,UAAU,EAAE,UAAU,GAAG;QAAE,OAAO,EAAE,MAAM,CAAA;KAAE;IA8EnD;;OAEG;IACH,SAAS,CAAC,KAAK,EAAE,YAAY,EAAE,OAAO,GAAE,OAAc,GAAG,MAAM;IAe/D;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,YAAY,GAAG,YAAY;IAW7C;;OAEG;IACH,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;IAUlC;;OAEG;IACH,KAAK,IAAI,IAAI;IAab,OAAO,CAAC,SAAS;IAejB,OAAO,CAAC,UAAU;IAQlB,OAAO,CAAC,gBAAgB;IAexB,OAAO,CAAC,WAAW;IAuBnB,OAAO,CAAC,gBAAgB;IAexB,OAAO,CAAC,WAAW;IAWnB,OAAO,CAAC,MAAM;CAWf;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC,eAAe,CAAC,GAAG,SAAS,CAE5E"}
@@ -0,0 +1,259 @@
1
+ /**
2
+ * Tabular Q-Learning
3
+ *
4
+ * Classic Q-learning algorithm with:
5
+ * - Epsilon-greedy exploration
6
+ * - State hashing for continuous states
7
+ * - Eligibility traces (optional)
8
+ * - Experience replay
9
+ *
10
+ * Suitable for smaller state spaces or discretized environments.
11
+ * Performance Target: <1ms per update
12
+ */
13
/**
 * Default hyper-parameters for tabular Q-learning.
 */
export const DEFAULT_QLEARNING_CONFIG = {
    algorithm: 'q-learning',
    learningRate: 0.1, // alpha: TD update step size
    gamma: 0.99, // discount factor
    entropyCoef: 0,
    valueLossCoef: 1,
    maxGradNorm: 1,
    epochs: 1,
    miniBatchSize: 1,
    explorationInitial: 1.0, // starting epsilon
    explorationFinal: 0.01, // epsilon floor
    explorationDecay: 10000, // steps over which epsilon anneals linearly
    maxStates: 10000, // Q-table capacity before LRU pruning kicks in
    useEligibilityTraces: false,
    traceDecay: 0.9, // lambda for eligibility traces
};
/**
 * Tabular Q-learning with epsilon-greedy exploration.
 *
 * Continuous states are discretized via binning into string keys; actions
 * are strings hashed onto a fixed discrete index range. Optionally applies
 * eligibility traces, and prunes the Q-table LRU-style when it grows past
 * the configured capacity.
 */
export class QLearning {
    config;
    // Q-table: state hash -> { qValues: Float32Array, visits, lastUpdate }
    qTable = new Map();
    // Current epsilon for epsilon-greedy action selection
    epsilon;
    stepCount = 0;
    // Size of the discrete action space (action strings hash into [0, numActions))
    numActions = 4;
    // Eligibility traces: state hash -> per-action trace strengths
    traces = new Map();
    // Statistics
    updateCount = 0;
    avgTDError = 0;
    /**
     * @param {Partial<QLearningConfig>} config - overrides merged over the defaults
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_QLEARNING_CONFIG, ...config };
        this.epsilon = this.config.explorationInitial;
    }
    /**
     * Run one pass of TD updates over a trajectory of steps.
     *
     * The final step of the trajectory is treated as terminal (its target is
     * the bare reward); every other step bootstraps from max-Q of the next
     * state. Also anneals epsilon and prunes the Q-table when oversized.
     *
     * @param {Trajectory} trajectory - steps with stateBefore/action/reward/stateAfter
     * @returns {{ tdError: number }} mean absolute TD error over the trajectory
     */
    update(trajectory) {
        const t0 = performance.now();
        const steps = trajectory.steps;
        if (steps.length === 0) {
            return { tdError: 0 };
        }
        // Traces are episode-local: start each trajectory from a clean slate.
        if (this.config.useEligibilityTraces) {
            this.traces.clear();
        }
        let tdErrorSum = 0;
        for (const [i, step] of steps.entries()) {
            const stateKey = this.hashState(step.stateBefore);
            const action = this.hashAction(step.action);
            const entry = this.getOrCreateEntry(stateKey);
            // Target: reward only at the terminal (last) step, otherwise
            // reward + gamma * max_a Q(s', a).
            let target = step.reward;
            if (i !== steps.length - 1) {
                const nextQ = this.getOrCreateEntry(this.hashState(step.stateAfter)).qValues;
                let best = -Infinity;
                for (const q of nextQ) {
                    if (q > best) {
                        best = q;
                    }
                }
                target += this.config.gamma * best;
            }
            const tdError = target - entry.qValues[action];
            tdErrorSum += Math.abs(tdError);
            if (this.config.useEligibilityTraces) {
                // Refresh the trace for this state-action, then credit every
                // traced state proportionally to its trace strength.
                this.updateTrace(stateKey, action);
                this.updateWithTraces(tdError);
            }
            else {
                // Plain one-step Q-learning update.
                entry.qValues[action] += this.config.learningRate * tdError;
                entry.visits += 1;
                entry.lastUpdate = Date.now();
            }
        }
        // Linear epsilon annealing toward the configured floor.
        this.stepCount += steps.length;
        this.epsilon = Math.max(this.config.explorationFinal, this.config.explorationInitial - this.stepCount / this.config.explorationDecay);
        if (this.qTable.size > this.config.maxStates) {
            this.pruneQTable();
        }
        this.updateCount += 1;
        this.avgTDError = tdErrorSum / steps.length;
        const elapsed = performance.now() - t0;
        if (elapsed > 1) {
            console.warn(`Q-learning update exceeded target: ${elapsed.toFixed(2)}ms > 1ms`);
        }
        return { tdError: this.avgTDError };
    }
    /**
     * Choose an action with the epsilon-greedy policy.
     *
     * @param {Float32Array} state - raw (continuous) state vector
     * @param {boolean} [explore=true] - allow random exploration
     * @returns {number} discrete action index in [0, numActions)
     */
    getAction(state, explore = true) {
        const randomAction = () => Math.floor(Math.random() * this.numActions);
        if (explore && Math.random() < this.epsilon) {
            return randomAction();
        }
        const entry = this.qTable.get(this.hashState(state));
        // Unseen states have no preference; fall back to a random action.
        return entry ? this.argmax(entry.qValues) : randomAction();
    }
    /**
     * Q-values for a state (a defensive copy; zeros for unseen states).
     *
     * @param {Float32Array} state - raw (continuous) state vector
     * @returns {Float32Array} per-action Q-values
     */
    getQValues(state) {
        const entry = this.qTable.get(this.hashState(state));
        return entry ? new Float32Array(entry.qValues) : new Float32Array(this.numActions);
    }
    /**
     * Snapshot of learning statistics.
     * @returns {Record<string, number>}
     */
    getStats() {
        return {
            updateCount: this.updateCount,
            qTableSize: this.qTable.size,
            epsilon: this.epsilon,
            avgTDError: this.avgTDError,
            stepCount: this.stepCount,
        };
    }
    /**
     * Discard all learned values and restore the initial exploration schedule.
     */
    reset() {
        this.qTable.clear();
        this.traces.clear();
        this.epsilon = this.config.explorationInitial;
        this.stepCount = 0;
        this.updateCount = 0;
        this.avgTDError = 0;
    }
    // ==========================================================================
    // Private Methods
    // ==========================================================================
    /**
     * Discretize a continuous state into a string key by binning the first
     * 8 dimensions into 10 buckets each; values are assumed to lie in [-1, 1]
     * (out-of-range values clamp to the edge bins).
     */
    hashState(state) {
        const BINS = 10;
        const dims = Math.min(8, state.length);
        return Array.from({ length: dims }, (_, d) => {
            const unit = (state[d] + 1) / 2; // map assumed [-1, 1] onto [0, 1]
            return Math.floor(Math.max(0, Math.min(BINS - 1, unit * BINS)));
        }).join(',');
    }
    /** Map an action string onto a discrete index via a polynomial hash mod numActions. */
    hashAction(action) {
        let code = 0;
        for (let i = 0; i < action.length; i++) {
            code = (code * 31 + action.charCodeAt(i)) % this.numActions;
        }
        return code;
    }
    /** Fetch the Q-table row for a state key, lazily initializing it to zeros. */
    getOrCreateEntry(stateKey) {
        const existing = this.qTable.get(stateKey);
        if (existing) {
            return existing;
        }
        const fresh = {
            qValues: new Float32Array(this.numActions),
            visits: 0,
            lastUpdate: Date.now(),
        };
        this.qTable.set(stateKey, fresh);
        return fresh;
    }
    /** Decay every trace, drop negligible ones, then mark (state, action) as just visited. */
    updateTrace(stateKey, action) {
        const decay = this.config.gamma * this.config.traceDecay;
        for (const [key, vec] of this.traces) {
            let peak = -Infinity;
            for (let a = 0; a < this.numActions; a++) {
                vec[a] *= decay;
                if (vec[a] > peak) {
                    peak = vec[a];
                }
            }
            // A near-zero trace no longer contributes meaningfully; free it.
            if (peak < 0.001) {
                this.traces.delete(key);
            }
        }
        if (!this.traces.has(stateKey)) {
            this.traces.set(stateKey, new Float32Array(this.numActions));
        }
        this.traces.get(stateKey)[action] = 1.0;
    }
    /** Apply the TD error to every traced state, weighted by its trace strength. */
    updateWithTraces(tdError) {
        const scaled = this.config.learningRate * tdError;
        for (const [stateKey, vec] of this.traces) {
            const entry = this.qTable.get(stateKey);
            if (!entry) {
                continue; // traced state was never materialized in the Q-table
            }
            for (let a = 0; a < this.numActions; a++) {
                entry.qValues[a] += scaled * vec[a];
            }
            entry.visits += 1;
            entry.lastUpdate = Date.now();
        }
    }
    /** Evict the least-recently-updated states until ~80% of capacity remains. */
    pruneQTable() {
        const byAge = [...this.qTable.entries()]
            .sort((lhs, rhs) => lhs[1].lastUpdate - rhs[1].lastUpdate);
        const evictCount = byAge.length - Math.floor(this.config.maxStates * 0.8);
        for (let i = 0; i < evictCount; i++) {
            this.qTable.delete(byAge[i][0]);
        }
    }
    /** Index of the largest value (first one wins on ties). */
    argmax(values) {
        let best = 0;
        for (let i = 1; i < values.length; i++) {
            if (values[i] > values[best]) {
                best = i;
            }
        }
        return best;
    }
}
253
/**
 * Convenience factory for QLearning agents.
 *
 * @param {Partial<QLearningConfig>} [config] - optional overrides merged over DEFAULT_QLEARNING_CONFIG
 * @returns {QLearning} a freshly constructed agent
 */
export function createQLearning(config = {}) {
    return new QLearning(config);
}
259
+ //# sourceMappingURL=q-learning.js.map