moflo 4.8.32 → 4.8.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/bin/generate-code-map.mjs +955 -955
  2. package/bin/index-guidance.mjs +905 -905
  3. package/bin/index-tests.mjs +728 -728
  4. package/bin/setup-project.mjs +252 -252
  5. package/package.json +10 -5
  6. package/src/@claude-flow/cli/dist/src/commands/doctor.js +1339 -1107
  7. package/src/@claude-flow/cli/dist/src/index.js +2 -18
  8. package/src/@claude-flow/cli/dist/src/mcp-tools/hooks-tools.js +17 -0
  9. package/src/@claude-flow/cli/dist/src/memory/memory-initializer.js +4 -7
  10. package/src/@claude-flow/cli/dist/src/version.js +6 -0
  11. package/src/@claude-flow/cli/package.json +1 -1
  12. package/src/@claude-flow/neural/README.md +260 -0
  13. package/src/@claude-flow/neural/dist/algorithms/a2c.js +361 -0
  14. package/src/@claude-flow/neural/dist/algorithms/curiosity.js +392 -0
  15. package/src/@claude-flow/neural/dist/algorithms/decision-transformer.js +415 -0
  16. package/src/@claude-flow/neural/dist/algorithms/dqn.js +303 -0
  17. package/src/@claude-flow/neural/dist/algorithms/index.js +74 -0
  18. package/src/@claude-flow/neural/dist/algorithms/ppo.js +331 -0
  19. package/src/@claude-flow/neural/dist/algorithms/q-learning.js +259 -0
  20. package/src/@claude-flow/neural/dist/algorithms/sarsa.js +297 -0
  21. package/src/@claude-flow/neural/dist/application/index.js +7 -0
  22. package/src/@claude-flow/neural/dist/application/services/neural-application-service.js +161 -0
  23. package/src/@claude-flow/neural/dist/domain/entities/pattern.js +134 -0
  24. package/src/@claude-flow/neural/dist/domain/index.js +8 -0
  25. package/src/@claude-flow/neural/dist/domain/services/learning-service.js +195 -0
  26. package/src/@claude-flow/neural/dist/index.js +201 -0
  27. package/src/@claude-flow/neural/dist/modes/balanced.js +234 -0
  28. package/src/@claude-flow/neural/dist/modes/base.js +77 -0
  29. package/src/@claude-flow/neural/dist/modes/batch.js +316 -0
  30. package/src/@claude-flow/neural/dist/modes/edge.js +310 -0
  31. package/src/@claude-flow/neural/dist/modes/index.js +13 -0
  32. package/src/@claude-flow/neural/dist/modes/real-time.js +196 -0
  33. package/src/@claude-flow/neural/dist/modes/research.js +389 -0
  34. package/src/@claude-flow/neural/dist/pattern-learner.js +603 -0
  35. package/src/@claude-flow/neural/dist/reasoning-bank.js +993 -0
  36. package/src/@claude-flow/neural/dist/reasoningbank-adapter.js +463 -0
  37. package/src/@claude-flow/neural/dist/sona-integration.js +326 -0
  38. package/src/@claude-flow/neural/dist/sona-manager.js +695 -0
  39. package/src/@claude-flow/neural/dist/types.js +11 -0
  40. package/src/@claude-flow/neural/package.json +26 -0
@@ -0,0 +1,303 @@
1
+ /**
2
+ * Deep Q-Network (DQN)
3
+ *
4
+ * Implements DQN with enhancements:
5
+ * - Experience replay
6
+ * - Target network
7
+ * - Double DQN (optional)
8
+ * - Dueling architecture (optional)
9
+ * - Epsilon-greedy exploration
10
+ *
11
+ * Performance Target: <10ms per update step
12
+ */
13
/**
 * Baseline settings for DQNAlgorithm.
 *
 * The DQNAlgorithm constructor spreads caller overrides on top of this object,
 * so any key left out by the caller keeps the value listed here.
 */
export const DEFAULT_DQN_CONFIG = {
    algorithm: 'dqn',

    // Optimisation
    learningRate: 1e-4,
    gamma: 0.99,
    entropyCoef: 0,
    valueLossCoef: 1,
    maxGradNorm: 10,
    epochs: 1,
    miniBatchSize: 32,

    // Replay memory capacity (number of stored transitions)
    bufferSize: 10_000,

    // Epsilon-greedy schedule: start value, floor, and decay divisor
    explorationInitial: 1,
    explorationFinal: 0.01,
    explorationDecay: 10_000,

    // Target network refresh interval, counted in update() calls
    targetUpdateFreq: 100,

    // Optional enhancements
    doubleDQN: true,
    duelingNetwork: false,
};
33
/**
 * Deep Q-Network agent.
 *
 * Q-values come from a bias-free two-layer perceptron
 * (inputDim -> 64 ReLU units -> numActions) stored as flat Float32Arrays.
 * Transitions are kept in a fixed-capacity ring buffer; update() samples a
 * mini-batch from it, takes one momentum-smoothed step on the squared TD
 * error, periodically copies the online weights into the target network and
 * lowers epsilon.
 */
export class DQNAlgorithm {
    /** DEFAULT_DQN_CONFIG overlaid with the caller's overrides. */
    config;
    /** Online network: [inputDim x 64 layer, 64 x numActions layer]. */
    qWeights;
    /** Snapshot of the online network used for bootstrapped targets. */
    targetWeights;
    /** Per-weight momentum accumulators, same shapes as qWeights. */
    qMomentum;
    /** Replay memory, used as a ring once it reaches config.bufferSize. */
    buffer = [];
    /** Next ring slot to write. */
    bufferIdx = 0;
    /** Current probability of a random action in getAction(). */
    epsilon;
    /** Number of completed update() calls (drives target sync and epsilon). */
    stepCount = 0;
    /** Size of the discrete action space. */
    numActions = 4;
    /** Maximum number of state components read by the network. */
    inputDim = 768;
    /** Number of update() calls that actually trained on a batch. */
    updateCount = 0;
    /** Mean squared TD error of the most recent batch. */
    avgLoss = 0;

    /**
     * @param {object} [config] Partial configuration; missing keys fall back
     *   to DEFAULT_DQN_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_DQN_CONFIG, ...config };
        this.epsilon = this.config.explorationInitial;
        const online = this.initializeNetwork();
        this.qWeights = online;
        this.targetWeights = this.copyNetwork(online);
        this.qMomentum = online.map((layer) => new Float32Array(layer.length));
    }

    /**
     * Convert every step of a trajectory into a replay transition.
     *
     * The last step is flagged as terminal. Once the buffer holds
     * config.bufferSize entries, new transitions overwrite the oldest slot.
     *
     * @param {{steps: Array<{stateBefore: ArrayLike<number>, action: string,
     *   reward: number, stateAfter: ArrayLike<number>}>}} trajectory
     */
    addExperience(trajectory) {
        const { steps } = trajectory;
        const capacity = this.config.bufferSize;
        for (const [index, step] of steps.entries()) {
            const isTerminal = index >= steps.length - 1 || steps[index + 1] === null;
            const transition = {
                state: step.stateBefore,
                action: this.hashAction(step.action),
                reward: step.reward,
                nextState: step.stateAfter,
                done: isTerminal,
            };
            if (this.buffer.length >= capacity) {
                this.buffer[this.bufferIdx] = transition;
            }
            else {
                this.buffer.push(transition);
            }
            this.bufferIdx = (this.bufferIdx + 1) % capacity;
        }
    }

    /**
     * Run one training step on a sampled mini-batch.
     *
     * Does nothing (and reports a loss of 0) while the buffer holds fewer
     * than config.miniBatchSize transitions. Logs a warning when the step
     * takes longer than 10ms.
     *
     * @returns {{loss: number, epsilon: number}} Mean squared TD error of the
     *   batch and the exploration rate after this step.
     */
    update() {
        const startedAt = performance.now();
        const { config } = this;
        if (this.buffer.length < config.miniBatchSize) {
            return { loss: 0, epsilon: this.epsilon };
        }

        const batch = this.sampleBatch();
        const gradients = this.qWeights.map((layer) => new Float32Array(layer.length));
        let squaredErrorSum = 0;
        for (const sample of batch) {
            const predicted = this.forward(sample.state, this.qWeights)[sample.action];
            const tdError = this.tdTarget(sample) - predicted;
            squaredErrorSum += tdError * tdError;
            this.accumulateGradients(gradients, sample.state, sample.action, tdError);
        }
        this.applyGradients(gradients, batch.length);

        // Refresh the target network every targetUpdateFreq steps.
        this.stepCount += 1;
        if (this.stepCount % config.targetUpdateFreq === 0) {
            this.targetWeights = this.copyNetwork(this.qWeights);
        }

        // Linear epsilon decay, floored at explorationFinal.
        this.epsilon = Math.max(config.explorationFinal, config.explorationInitial - this.stepCount / config.explorationDecay);

        this.updateCount += 1;
        this.avgLoss = squaredErrorSum / batch.length;

        const elapsed = performance.now() - startedAt;
        if (elapsed > 10) {
            console.warn(`DQN update exceeded target: ${elapsed.toFixed(2)}ms > 10ms`);
        }
        return { loss: this.avgLoss, epsilon: this.epsilon };
    }

    /**
     * Choose an action index for a state.
     *
     * @param {ArrayLike<number>} state
     * @param {boolean} [explore=true] When true, a uniformly random action is
     *   returned with probability epsilon; otherwise the greedy action.
     * @returns {number} Index in [0, numActions).
     */
    getAction(state, explore = true) {
        if (explore && Math.random() < this.epsilon) {
            return Math.floor(Math.random() * this.numActions);
        }
        return this.argmax(this.forward(state, this.qWeights));
    }

    /**
     * @param {ArrayLike<number>} state
     * @returns {Float32Array} Online-network Q-value for every action.
     */
    getQValues(state) {
        return this.forward(state, this.qWeights);
    }

    /**
     * @returns {{updateCount: number, bufferSize: number, epsilon: number,
     *   avgLoss: number, stepCount: number}} Snapshot of training counters.
     */
    getStats() {
        const { updateCount, epsilon, avgLoss, stepCount } = this;
        return {
            updateCount,
            bufferSize: this.buffer.length,
            epsilon,
            avgLoss,
            stepCount,
        };
    }

    // ==========================================================================
    // Private Methods
    // ==========================================================================

    /**
     * Build both weight layers. Each weight is drawn uniformly from
     * [-0.5, 0.5) and multiplied by sqrt(2 / fanIn).
     */
    initializeNetwork() {
        const hiddenDim = 64;
        const randomLayer = (size, fanIn) => {
            const scale = Math.sqrt(2 / fanIn);
            const layer = new Float32Array(size);
            for (let k = 0; k < size; k++) {
                layer[k] = (Math.random() - 0.5) * scale;
            }
            return layer;
        };
        // The input layer is filled before the output layer.
        const inputLayer = randomLayer(this.inputDim * hiddenDim, this.inputDim);
        const outputLayer = randomLayer(hiddenDim * this.numActions, hiddenDim);
        return [inputLayer, outputLayer];
    }

    /** Return independent Float32Array copies of every layer. */
    copyNetwork(weights) {
        return weights.map((layer) => new Float32Array(layer));
    }

    /**
     * ReLU activations of the hidden layer. Only the first
     * min(state.length, inputDim) state components are read.
     */
    hiddenLayer(state, weights) {
        const hiddenDim = 64;
        const inputWeights = weights[0];
        const width = Math.min(state.length, this.inputDim);
        const hidden = new Float32Array(hiddenDim);
        for (let h = 0; h < hiddenDim; h++) {
            let preActivation = 0;
            for (let i = 0; i < width; i++) {
                preActivation += state[i] * inputWeights[i * hiddenDim + h];
            }
            hidden[h] = Math.max(0, preActivation);
        }
        return hidden;
    }

    /** Evaluate the network: linear read-out on top of the ReLU hidden layer. */
    forward(state, weights) {
        const hidden = this.hiddenLayer(state, weights);
        const outputWeights = weights[1];
        const actionCount = this.numActions;
        const qValues = new Float32Array(actionCount);
        for (let a = 0; a < actionCount; a++) {
            let total = 0;
            for (let h = 0; h < hidden.length; h++) {
                total += hidden[h] * outputWeights[h * actionCount + a];
            }
            qValues[a] = total;
        }
        return qValues;
    }

    /**
     * Bootstrapped target for one transition: the raw reward when terminal,
     * otherwise reward + gamma * Q_target(nextState, a*), where a* is chosen
     * by the online network (Double DQN) or by the target network itself.
     */
    tdTarget(sample) {
        if (sample.done) {
            return sample.reward;
        }
        const targetQ = this.forward(sample.nextState, this.targetWeights);
        const bootstrap = this.config.doubleDQN
            ? targetQ[this.argmax(this.forward(sample.nextState, this.qWeights))]
            : Math.max(...targetQ);
        return sample.reward + this.config.gamma * bootstrap;
    }

    /**
     * Add one sample's contribution to the gradient buffers. Only the output
     * weights of the taken action, and the input weights feeding active ReLU
     * units, receive a contribution.
     */
    accumulateGradients(gradients, state, action, tdError) {
        const hidden = this.hiddenLayer(state, this.qWeights);
        const hiddenDim = hidden.length;
        const width = Math.min(state.length, this.inputDim);
        const [inputGrad, outputGrad] = gradients;
        for (let h = 0; h < hiddenDim; h++) {
            const slot = h * this.numActions + action;
            outputGrad[slot] += hidden[h] * tdError;
            if (hidden[h] > 0) {
                // Back-propagate through the active ReLU unit.
                const upstream = tdError * this.qWeights[1][slot];
                for (let i = 0; i < width; i++) {
                    inputGrad[i * hiddenDim + h] += state[i] * upstream;
                }
            }
        }
    }

    /**
     * Clamp every accumulated gradient component to +/- maxGradNorm, fold it
     * into the momentum buffer (beta = 0.9) and move the online weights by
     * (learningRate / batchSize) times the stored momentum.
     */
    applyGradients(gradients, batchSize) {
        const beta = 0.9;
        const stepSize = this.config.learningRate / batchSize;
        const limit = this.config.maxGradNorm;
        gradients.forEach((layerGrad, layer) => {
            const momentum = this.qMomentum[layer];
            const weights = this.qWeights[layer];
            for (let k = 0; k < layerGrad.length; k++) {
                const clipped = Math.max(Math.min(layerGrad[k], limit), -limit);
                momentum[k] = beta * momentum[k] + (1 - beta) * clipped;
                // Use the value as stored (float32) in the momentum buffer.
                weights[k] += stepSize * momentum[k];
            }
        });
    }

    /** Draw up to miniBatchSize distinct transitions uniformly at random. */
    sampleBatch() {
        const wanted = Math.min(this.config.miniBatchSize, this.buffer.length);
        const chosen = new Set();
        while (chosen.size < wanted) {
            chosen.add(Math.floor(Math.random() * this.buffer.length));
        }
        return Array.from(chosen, (slot) => this.buffer[slot]);
    }

    /** Map an action string to an index in [0, numActions) with a base-31 rolling hash. */
    hashAction(action) {
        let code = 0;
        for (let pos = 0; pos < action.length; pos++) {
            code = (code * 31 + action.charCodeAt(pos)) % this.numActions;
        }
        return code;
    }

    /** Index of the first maximum; 0 for an empty input. */
    argmax(values) {
        let best = 0;
        let bestValue = values[0];
        for (let k = 1; k < values.length; k++) {
            const candidate = values[k];
            if (candidate > bestValue) {
                bestValue = candidate;
                best = k;
            }
        }
        return best;
    }
}
297
/**
 * Build a DQNAlgorithm instance.
 *
 * @param {object} [config] Partial DQN configuration, forwarded unchanged to
 *   the DQNAlgorithm constructor.
 * @returns {DQNAlgorithm} A freshly constructed agent.
 */
export function createDQN(config) {
    const agent = new DQNAlgorithm(config);
    return agent;
}
303
+ //# sourceMappingURL=dqn.js.map
@@ -0,0 +1,74 @@
1
+ /**
2
+ * RL Algorithms Index
3
+ *
4
+ * Exports all reinforcement learning algorithm implementations.
5
+ */
6
+ // PPO - Proximal Policy Optimization
7
+ export { PPOAlgorithm, createPPO, DEFAULT_PPO_CONFIG, } from './ppo.js';
8
+ // DQN - Deep Q-Network
9
+ export { DQNAlgorithm, createDQN, DEFAULT_DQN_CONFIG, } from './dqn.js';
10
+ // A2C - Advantage Actor-Critic
11
+ export { A2CAlgorithm, createA2C, DEFAULT_A2C_CONFIG, } from './a2c.js';
12
+ // Decision Transformer
13
+ export { DecisionTransformer, createDecisionTransformer, DEFAULT_DT_CONFIG, } from './decision-transformer.js';
14
+ // Q-Learning (Tabular)
15
+ export { QLearning, createQLearning, DEFAULT_QLEARNING_CONFIG, } from './q-learning.js';
16
+ // SARSA
17
+ export { SARSAAlgorithm, createSARSA, DEFAULT_SARSA_CONFIG, } from './sarsa.js';
18
+ // Curiosity-Driven Exploration
19
+ export { CuriosityModule, createCuriosity, DEFAULT_CURIOSITY_CONFIG, } from './curiosity.js';
20
+ import { createPPO, DEFAULT_PPO_CONFIG } from './ppo.js';
21
+ import { createDQN, DEFAULT_DQN_CONFIG } from './dqn.js';
22
+ import { createA2C, DEFAULT_A2C_CONFIG } from './a2c.js';
23
+ import { createDecisionTransformer, DEFAULT_DT_CONFIG } from './decision-transformer.js';
24
+ import { createQLearning, DEFAULT_QLEARNING_CONFIG } from './q-learning.js';
25
+ import { createSARSA, DEFAULT_SARSA_CONFIG } from './sarsa.js';
26
+ import { createCuriosity, DEFAULT_CURIOSITY_CONFIG } from './curiosity.js';
27
/**
 * Instantiate a reinforcement-learning algorithm from its string identifier.
 *
 * @param {string} algorithm One of 'ppo', 'dqn', 'a2c',
 *   'decision-transformer', 'q-learning', 'sarsa' or 'curiosity'.
 * @param {object} [config] Configuration handed unchanged to the matching
 *   factory function.
 * @returns {object} Whatever the selected factory returns.
 * @throws {Error} When `algorithm` is not one of the identifiers above.
 */
export function createAlgorithm(algorithm, config) {
    // Thunks keep the lookup lazy: only the selected factory is ever touched.
    const builders = new Map([
        ['ppo', () => createPPO(config)],
        ['dqn', () => createDQN(config)],
        ['a2c', () => createA2C(config)],
        ['decision-transformer', () => createDecisionTransformer(config)],
        ['q-learning', () => createQLearning(config)],
        ['sarsa', () => createSARSA(config)],
        ['curiosity', () => createCuriosity(config)],
    ]);
    const build = builders.get(algorithm);
    if (build === undefined) {
        throw new Error(`Unknown algorithm: ${algorithm}`);
    }
    return build();
}
51
/**
 * Look up the default configuration of an algorithm.
 *
 * @param {string} algorithm One of 'ppo', 'dqn', 'a2c',
 *   'decision-transformer', 'q-learning', 'sarsa' or 'curiosity'.
 * @returns {object} A shallow copy of the matching DEFAULT_*_CONFIG object, so
 *   adding or replacing top-level keys on the result does not affect the
 *   shared default.
 * @throws {Error} When `algorithm` is not one of the identifiers above.
 */
export function getDefaultConfig(algorithm) {
    // Thunks keep the lookup lazy: only the selected default is ever read.
    const defaults = new Map([
        ['ppo', () => DEFAULT_PPO_CONFIG],
        ['dqn', () => DEFAULT_DQN_CONFIG],
        ['a2c', () => DEFAULT_A2C_CONFIG],
        ['decision-transformer', () => DEFAULT_DT_CONFIG],
        ['q-learning', () => DEFAULT_QLEARNING_CONFIG],
        ['sarsa', () => DEFAULT_SARSA_CONFIG],
        ['curiosity', () => DEFAULT_CURIOSITY_CONFIG],
    ]);
    const resolve = defaults.get(algorithm);
    if (resolve === undefined) {
        throw new Error(`Unknown algorithm: ${algorithm}`);
    }
    return { ...resolve() };
}
74
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Proximal Policy Optimization (PPO)
3
+ *
4
+ * Implements PPO algorithm for stable policy learning with:
5
+ * - Clipped surrogate objective
6
+ * - GAE (Generalized Advantage Estimation)
7
+ * - Value function clipping
8
+ * - Entropy bonus
9
+ *
10
+ * Performance Target: <10ms per update step
11
+ */
12
/**
 * Baseline settings for PPOAlgorithm.
 *
 * The PPOAlgorithm constructor spreads caller overrides on top of this object,
 * so any key left out by the caller keeps the value listed here.
 */
export const DEFAULT_PPO_CONFIG = {
    algorithm: 'ppo',

    // Optimisation
    learningRate: 3e-4,
    gamma: 0.99,
    entropyCoef: 0.01,
    valueLossCoef: 0.5,
    maxGradNorm: 0.5,
    epochs: 4,
    miniBatchSize: 64,

    // Clipping: policy ratio range, and value range (null = value clipping off)
    clipRange: 0.2,
    clipRangeVf: null,

    // Mini-batch KL threshold; update() stops early above 1.5x this value
    targetKL: 0.01,

    // Lambda used by generalized advantage estimation
    gaeLambda: 0.95,
};
29
/**
 * Proximal Policy Optimization with a deliberately simplified linear model.
 *
 * Policy and value function each use a single weight vector of length 768.
 * addExperience() turns trajectories into buffered samples (value estimate,
 * log-probability, GAE advantage, discounted return); update() then runs
 * several shuffled mini-batch epochs over the buffer and clears it.
 */
export class PPOAlgorithm {
    /** DEFAULT_PPO_CONFIG overlaid with the caller's overrides. */
    config;
    /** Linear policy weights shared by all actions (scaled per action). */
    policyWeights;
    /** Linear value-function weights. */
    valueWeights;
    /** Momentum accumulator for policyWeights. */
    policyMomentum;
    /** Momentum accumulator for valueWeights. */
    valueMomentum;
    /** On-policy samples collected since the last update(). */
    buffer = [];
    /** Number of update() calls that trained on the buffer. */
    updateCount = 0;
    /** Sum over updates of the combined loss; getStats() reports its mean. */
    totalLoss = 0;
    /** Mean mini-batch KL estimate of the most recent update(). */
    approxKL = 0;
    /** Mean fraction of clipped ratios in the most recent update(). */
    clipFraction = 0;
    /** Size of the discrete action space (used by logits and action hashing). */
    numActions = 4;

    /**
     * @param {object} [config] Partial configuration; missing keys fall back
     *   to DEFAULT_PPO_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_PPO_CONFIG, ...config };
        // 768 input dimensions, one weight per dimension.
        const dim = 768;
        this.policyWeights = new Float32Array(dim);
        this.valueWeights = new Float32Array(dim);
        this.policyMomentum = new Float32Array(dim);
        this.valueMomentum = new Float32Array(dim);
        // Uniform draw from [-0.5, 0.5) scaled by sqrt(2 / dim).
        const scale = Math.sqrt(2 / dim);
        for (let i = 0; i < dim; i++) {
            this.policyWeights[i] = (Math.random() - 0.5) * scale;
            this.valueWeights[i] = (Math.random() - 0.5) * scale;
        }
    }

    /**
     * Convert a trajectory into buffered training samples.
     *
     * Values and log-probabilities are computed with the current weights, so
     * they act as the "old policy" reference during update().
     *
     * NOTE(review): every sample is keyed on `step.stateAfter` (the sibling
     * DQN implementation uses `stateBefore` as the acting state) - confirm
     * this is intentional.
     *
     * @param {{steps: Array<{stateAfter: ArrayLike<number>, action: string,
     *   reward: number}>}} trajectory Empty trajectories are ignored.
     */
    addExperience(trajectory) {
        const { steps } = trajectory;
        if (steps.length === 0) {
            return;
        }
        const rewards = steps.map((step) => step.reward);
        const values = steps.map((step) => this.computeValue(step.stateAfter));
        const advantages = this.computeGAE(rewards, values);
        const returns = this.computeReturns(rewards);
        for (let i = 0; i < steps.length; i++) {
            const step = steps[i];
            this.buffer.push({
                state: step.stateAfter,
                action: this.hashAction(step.action),
                reward: step.reward,
                value: values[i],
                logProb: this.computeLogProb(step.stateAfter, step.action),
                advantage: advantages[i],
                return_: returns[i],
            });
        }
    }

    /**
     * Run the PPO update over the buffered samples, then clear the buffer.
     *
     * Does nothing while the buffer holds fewer than config.miniBatchSize
     * samples. Training stops completely (no further mini-batches or epochs)
     * as soon as a mini-batch reports a KL estimate above 1.5 * targetKL; a
     * null/undefined targetKL disables that check. The statistics reported
     * by getStats() are refreshed on every completed update. Logs a warning
     * when the update takes longer than 10ms.
     *
     * @returns {{policyLoss: number, valueLoss: number, entropy: number}}
     *   Means over the mini-batches that were processed (all 0 if none).
     */
    update() {
        const startTime = performance.now();
        const { epochs, miniBatchSize, targetKL, valueLossCoef, entropyCoef } = this.config;
        if (this.buffer.length < miniBatchSize) {
            return { policyLoss: 0, valueLoss: 0, entropy: 0 };
        }

        // Normalize advantages to zero mean / unit variance across the buffer.
        const advantages = this.buffer.map((e) => e.advantage);
        const advMean = advantages.reduce((a, b) => a + b, 0) / advantages.length;
        const advStd = Math.sqrt(advantages.reduce((a, b) => a + (b - advMean) ** 2, 0) / advantages.length) + 1e-8;
        for (const exp of this.buffer) {
            exp.advantage = (exp.advantage - advMean) / advStd;
        }

        let totalPolicyLoss = 0;
        let totalValueLoss = 0;
        let totalEntropy = 0;
        let totalClipFrac = 0;
        let totalKL = 0;
        let numUpdates = 0;

        // null/undefined targetKL means "never stop early".
        const klLimit = targetKL == null ? Infinity : targetKL * 1.5;
        let stopEarly = false;
        for (let epoch = 0; epoch < epochs && !stopEarly; epoch++) {
            this.shuffleBuffer();
            for (let i = 0; i < this.buffer.length; i += miniBatchSize) {
                const batch = this.buffer.slice(i, i + miniBatchSize);
                // Skip a trailing batch that is less than half full.
                if (batch.length < miniBatchSize / 2) {
                    continue;
                }
                const result = this.updateMiniBatch(batch);
                totalPolicyLoss += result.policyLoss;
                totalValueLoss += result.valueLoss;
                totalEntropy += result.entropy;
                totalClipFrac += result.clipFrac;
                totalKL += result.kl;
                numUpdates++;
                if (result.kl > klLimit) {
                    // The policy moved too far: abandon the remaining epochs too.
                    stopEarly = true;
                    break;
                }
            }
        }

        const mean = (total) => (numUpdates > 0 ? total / numUpdates : 0);
        const policyLoss = mean(totalPolicyLoss);
        const valueLoss = mean(totalValueLoss);
        const entropy = mean(totalEntropy);

        // Publish the diagnostics that getStats() exposes.
        this.approxKL = mean(totalKL);
        this.clipFraction = mean(totalClipFrac);
        this.totalLoss += policyLoss + valueLossCoef * valueLoss - entropyCoef * entropy;

        this.buffer = [];
        this.updateCount++;

        const elapsed = performance.now() - startTime;
        if (elapsed > 10) {
            console.warn(`PPO update exceeded target: ${elapsed.toFixed(2)}ms > 10ms`);
        }
        return { policyLoss, valueLoss, entropy };
    }

    /**
     * Sample an action from the current policy.
     *
     * @param {ArrayLike<number>} state
     * @returns {{action: number, logProb: number, value: number}} Sampled
     *   action index, its log-probability, and the value estimate of `state`.
     */
    getAction(state) {
        const probs = this.softmax(this.computeLogits(state));
        const action = this.sampleAction(probs);
        return {
            action,
            logProb: Math.log(probs[action] + 1e-8),
            value: this.computeValue(state),
        };
    }

    /**
     * @returns {{updateCount: number, bufferSize: number, avgLoss: number,
     *   approxKL: number, clipFraction: number}} avgLoss is the mean combined
     *   loss per update; approxKL and clipFraction describe the latest update.
     */
    getStats() {
        return {
            updateCount: this.updateCount,
            bufferSize: this.buffer.length,
            avgLoss: this.updateCount > 0 ? this.totalLoss / this.updateCount : 0,
            approxKL: this.approxKL,
            clipFraction: this.clipFraction,
        };
    }

    // ==========================================================================
    // Private Methods
    // ==========================================================================

    /** Linear value estimate: dot product of state and valueWeights. */
    computeValue(state) {
        const width = Math.min(state.length, this.valueWeights.length);
        let value = 0;
        for (let i = 0; i < width; i++) {
            value += state[i] * this.valueWeights[i];
        }
        return value;
    }

    /**
     * Per-action logits. Every action shares policyWeights; action `a` scales
     * each term by (1 + 0.1 * a).
     */
    computeLogits(state) {
        const width = Math.min(state.length, this.policyWeights.length);
        const logits = new Float32Array(this.numActions);
        for (let a = 0; a < this.numActions; a++) {
            for (let i = 0; i < width; i++) {
                logits[a] += state[i] * this.policyWeights[i] * (1 + a * 0.1);
            }
        }
        return logits;
    }

    /** Log-probability the current policy assigns to an action string. */
    computeLogProb(state, action) {
        const probs = this.softmax(this.computeLogits(state));
        return Math.log(probs[this.hashAction(action)] + 1e-8);
    }

    /** Map an action string to an index in [0, numActions) with a base-31 rolling hash. */
    hashAction(action) {
        let hash = 0;
        for (let i = 0; i < action.length; i++) {
            hash = (hash * 31 + action.charCodeAt(i)) % this.numActions;
        }
        return hash;
    }

    /** Softmax with the maximum subtracted before exponentiation. */
    softmax(logits) {
        const max = Math.max(...logits);
        const exps = new Float32Array(logits.length);
        let sum = 0;
        for (let i = 0; i < logits.length; i++) {
            exps[i] = Math.exp(logits[i] - max);
            sum += exps[i];
        }
        for (let i = 0; i < exps.length; i++) {
            exps[i] /= sum;
        }
        return exps;
    }

    /** Inverse-CDF sampling; falls back to the last index on rounding shortfall. */
    sampleAction(probs) {
        const r = Math.random();
        let cumSum = 0;
        for (let i = 0; i < probs.length; i++) {
            cumSum += probs[i];
            if (r < cumSum) {
                return i;
            }
        }
        return probs.length - 1;
    }

    /**
     * Generalized Advantage Estimation, computed backwards through the
     * trajectory. The value following the last step is taken to be 0.
     */
    computeGAE(rewards, values) {
        const { gamma, gaeLambda } = this.config;
        const advantages = new Array(rewards.length).fill(0);
        let lastGae = 0;
        for (let t = rewards.length - 1; t >= 0; t--) {
            const nextValue = t < rewards.length - 1 ? values[t + 1] : 0;
            const delta = rewards[t] + gamma * nextValue - values[t];
            lastGae = delta + gamma * gaeLambda * lastGae;
            advantages[t] = lastGae;
        }
        return advantages;
    }

    /** Discounted reward-to-go for every step. */
    computeReturns(rewards) {
        const returns = new Array(rewards.length).fill(0);
        let cumReturn = 0;
        for (let t = rewards.length - 1; t >= 0; t--) {
            cumReturn = rewards[t] + this.config.gamma * cumReturn;
            returns[t] = cumReturn;
        }
        return returns;
    }

    /** In-place Fisher-Yates shuffle of the sample buffer. */
    shuffleBuffer() {
        for (let i = this.buffer.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [this.buffer[i], this.buffer[j]] = [this.buffer[j], this.buffer[i]];
        }
    }

    /**
     * Process one mini-batch: evaluate the clipped surrogate, value and
     * entropy terms, then take one momentum step (beta = 0.9) on both
     * weight vectors.
     *
     * @returns {{policyLoss: number, valueLoss: number, entropy: number,
     *   clipFrac: number, kl: number}} Per-sample means over the batch.
     */
    updateMiniBatch(batch) {
        const { clipRange, clipRangeVf, learningRate } = this.config;
        let policyLoss = 0;
        let valueLoss = 0;
        let entropy = 0;
        let clipFrac = 0;
        let kl = 0;
        const policyGrad = new Float32Array(this.policyWeights.length);
        const valueGrad = new Float32Array(this.valueWeights.length);
        for (const exp of batch) {
            // Evaluate the current policy and value function on the sample.
            const probs = this.softmax(this.computeLogits(exp.state));
            const newLogProb = Math.log(probs[exp.action] + 1e-8);
            const currentValue = this.computeValue(exp.state);

            // Probability ratio between the current and the sampling policy.
            const ratio = Math.exp(newLogProb - exp.logProb);
            const surr1 = ratio * exp.advantage;
            const surr2 = Math.max(Math.min(ratio, 1 + clipRange), 1 - clipRange) * exp.advantage;
            const policyLossI = -Math.min(surr1, surr2);
            policyLoss += policyLossI;
            if (Math.abs(ratio - 1) > clipRange) {
                clipFrac++;
            }

            // First-order KL estimate: old log-prob minus new log-prob.
            kl += (exp.logProb - newLogProb);

            // Squared value error, optionally clipped around the old value.
            let valueLossI;
            if (clipRangeVf !== null) {
                const valueClipped = exp.value + Math.max(Math.min(currentValue - exp.value, clipRangeVf), -clipRangeVf);
                const vf1 = (currentValue - exp.return_) ** 2;
                const vf2 = (valueClipped - exp.return_) ** 2;
                valueLossI = Math.max(vf1, vf2);
            }
            else {
                valueLossI = (currentValue - exp.return_) ** 2;
            }
            valueLoss += valueLossI;

            let entropyI = 0;
            for (const p of probs) {
                if (p > 0) {
                    entropyI -= p * Math.log(p);
                }
            }
            entropy += entropyI;

            // Simplified gradient: state scaled by the per-sample loss.
            const width = Math.min(exp.state.length, policyGrad.length);
            for (let i = 0; i < width; i++) {
                policyGrad[i] += exp.state[i] * policyLossI * 0.01;
                valueGrad[i] += exp.state[i] * valueLossI * 0.01;
            }
        }

        const beta = 0.9;
        for (let i = 0; i < this.policyWeights.length; i++) {
            this.policyMomentum[i] = beta * this.policyMomentum[i] + (1 - beta) * policyGrad[i];
            this.policyWeights[i] -= learningRate * this.policyMomentum[i];
            this.valueMomentum[i] = beta * this.valueMomentum[i] + (1 - beta) * valueGrad[i];
            this.valueWeights[i] -= learningRate * this.valueMomentum[i];
        }
        return {
            policyLoss: policyLoss / batch.length,
            valueLoss: valueLoss / batch.length,
            entropy: entropy / batch.length,
            clipFrac: clipFrac / batch.length,
            kl: kl / batch.length,
        };
    }
}
325
/**
 * Build a PPOAlgorithm instance.
 *
 * @param {object} [config] Partial PPO configuration, forwarded unchanged to
 *   the PPOAlgorithm constructor.
 * @returns {PPOAlgorithm} A freshly constructed agent.
 */
export function createPPO(config) {
    const agent = new PPOAlgorithm(config);
    return agent;
}
331
+ //# sourceMappingURL=ppo.js.map