moflo 4.8.32 → 4.8.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/bin/generate-code-map.mjs +1 -1
  2. package/bin/index-guidance.mjs +1 -1
  3. package/bin/index-tests.mjs +1 -1
  4. package/bin/setup-project.mjs +1 -1
  5. package/package.json +8 -4
  6. package/src/@claude-flow/cli/dist/src/commands/doctor.js +1298 -1107
  7. package/src/@claude-flow/cli/dist/src/memory/memory-initializer.js +4 -7
  8. package/src/@claude-flow/neural/README.md +260 -0
  9. package/src/@claude-flow/neural/dist/algorithms/a2c.js +361 -0
  10. package/src/@claude-flow/neural/dist/algorithms/curiosity.js +392 -0
  11. package/src/@claude-flow/neural/dist/algorithms/decision-transformer.js +415 -0
  12. package/src/@claude-flow/neural/dist/algorithms/dqn.js +303 -0
  13. package/src/@claude-flow/neural/dist/algorithms/index.js +74 -0
  14. package/src/@claude-flow/neural/dist/algorithms/ppo.js +331 -0
  15. package/src/@claude-flow/neural/dist/algorithms/q-learning.js +259 -0
  16. package/src/@claude-flow/neural/dist/algorithms/sarsa.js +297 -0
  17. package/src/@claude-flow/neural/dist/application/index.js +7 -0
  18. package/src/@claude-flow/neural/dist/application/services/neural-application-service.js +161 -0
  19. package/src/@claude-flow/neural/dist/domain/entities/pattern.js +134 -0
  20. package/src/@claude-flow/neural/dist/domain/index.js +8 -0
  21. package/src/@claude-flow/neural/dist/domain/services/learning-service.js +195 -0
  22. package/src/@claude-flow/neural/dist/index.js +201 -0
  23. package/src/@claude-flow/neural/dist/modes/balanced.js +234 -0
  24. package/src/@claude-flow/neural/dist/modes/base.js +77 -0
  25. package/src/@claude-flow/neural/dist/modes/batch.js +316 -0
  26. package/src/@claude-flow/neural/dist/modes/edge.js +310 -0
  27. package/src/@claude-flow/neural/dist/modes/index.js +13 -0
  28. package/src/@claude-flow/neural/dist/modes/real-time.js +196 -0
  29. package/src/@claude-flow/neural/dist/modes/research.js +389 -0
  30. package/src/@claude-flow/neural/dist/pattern-learner.js +603 -0
  31. package/src/@claude-flow/neural/dist/reasoning-bank.js +993 -0
  32. package/src/@claude-flow/neural/dist/reasoningbank-adapter.js +463 -0
  33. package/src/@claude-flow/neural/dist/sona-integration.js +316 -0
  34. package/src/@claude-flow/neural/dist/sona-manager.js +695 -0
  35. package/src/@claude-flow/neural/dist/types.js +11 -0
  36. package/src/@claude-flow/neural/package.json +26 -0
@@ -1950,14 +1950,11 @@ export async function searchEntries(options) {
1950
1950
  // Invalid embedding, use keyword score
1951
1951
  }
1952
1952
  }
1953
- // Fallback to keyword matching
1953
+ // Skip entries without valid semantic embeddings — keyword fallback
1954
+ // produces misleading 0.500 scores that degrade search quality.
1955
+ // Entries must have real vector embeddings to participate in semantic search.
1954
1956
  if (score < threshold) {
1955
- const lowerContent = (content || '').toLowerCase();
1956
- const lowerQuery = query.toLowerCase();
1957
- const words = lowerQuery.split(/\s+/);
1958
- const matchCount = words.filter(w => lowerContent.includes(w)).length;
1959
- const keywordScore = matchCount / words.length * 0.5;
1960
- score = Math.max(score, keywordScore);
1957
+ continue;
1961
1958
  }
1962
1959
  if (score >= threshold) {
1963
1960
  results.push({
@@ -0,0 +1,260 @@
1
+ # @claude-flow/neural
2
+
3
+ [![npm version](https://img.shields.io/npm/v/@claude-flow/neural.svg)](https://www.npmjs.com/package/@claude-flow/neural)
4
+ [![npm downloads](https://img.shields.io/npm/dm/@claude-flow/neural.svg)](https://www.npmjs.com/package/@claude-flow/neural)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/)
7
+ [![AI Learning](https://img.shields.io/badge/AI-Self--Learning-purple.svg)](https://github.com/eric-cielo/moflo)
8
+
9
+ > Self-Optimizing Neural Architecture (SONA) module for Claude Flow V3 - adaptive learning, trajectory tracking, and pattern-based optimization.
10
+
11
+ ## Features
12
+
13
+ - **SONA Learning** - Self-Optimizing Neural Architecture with <0.05ms adaptation time
14
+ - **5 Learning Modes** - Real-time, Balanced, Research, Edge, and Batch modes
15
+ - **9 RL Algorithms** - PPO, A2C, DQN, Q-Learning, SARSA, Decision Transformer, and more
16
+ - **LoRA Integration** - Low-Rank Adaptation for efficient fine-tuning
17
+ - **EWC++ Memory** - Elastic Weight Consolidation for continual learning without forgetting
18
+ - **Trajectory Tracking** - Record and learn from agent execution paths
19
+ - **Pattern Recognition** - Automatic pattern extraction and reuse
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ npm install @claude-flow/neural
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ ```typescript
30
+ import { SONAManager, createSONAManager } from '@claude-flow/neural';
31
+
32
+ // Create SONA manager
33
+ const sona = createSONAManager('balanced');
34
+ await sona.initialize();
35
+
36
+ // Begin trajectory tracking
37
+ const trajectoryId = sona.beginTrajectory('code-review-task', 'development');
38
+
39
+ // Record steps
40
+ sona.recordStep(trajectoryId, 'analyze-code', 0.8, stateEmbedding, {
41
+ filesAnalyzed: 5,
42
+ issuesFound: 2
43
+ });
44
+
45
+ sona.recordStep(trajectoryId, 'generate-feedback', 0.9, newStateEmbedding);
46
+
47
+ // Complete trajectory
48
+ const trajectory = sona.completeTrajectory(trajectoryId);
49
+
50
+ // Find similar patterns for guidance
51
+ const patterns = await sona.findSimilarPatterns(contextEmbedding, 3);
52
+ ```
53
+
54
+ ## Learning Modes
55
+
56
+ | Mode | Adaptation | Quality | Memory | Use Case |
57
+ |------|------------|---------|--------|----------|
58
+ | **real-time** | <0.5ms | 70%+ | 25MB | Production, low-latency |
59
+ | **balanced** | <18ms | 75%+ | 50MB | General purpose |
60
+ | **research** | <100ms | 95%+ | 100MB | Deep exploration |
61
+ | **edge** | <1ms | 80%+ | 5MB | Resource-constrained |
62
+ | **batch** | <50ms | 85%+ | 75MB | High-throughput |
63
+
64
+ ```typescript
65
+ // Switch modes dynamically
66
+ await sona.setMode('research');
67
+
68
+ // Get current configuration
69
+ const { mode, config, optimizations } = sona.getConfig();
70
+ ```
71
+
72
+ ## API Reference
73
+
74
+ ### SONA Manager
75
+
76
+ ```typescript
77
+ import { SONAManager } from '@claude-flow/neural';
78
+
79
+ const sona = new SONAManager('balanced');
80
+ await sona.initialize();
81
+
82
+ // Trajectory Management
83
+ const trajectoryId = sona.beginTrajectory(context, domain);
84
+ sona.recordStep(trajectoryId, action, reward, stateEmbedding, metadata);
85
+ const trajectory = sona.completeTrajectory(trajectoryId, finalQuality);
86
+
87
+ // Pattern Matching
88
+ const patterns = await sona.findSimilarPatterns(embedding, k);
89
+ const pattern = sona.storePattern({ name, strategy, embedding, domain });
90
+ sona.updatePatternUsage(patternId, quality);
91
+
92
+ // Learning
93
+ await sona.triggerLearning('manual');
94
+ const output = await sona.applyAdaptations(input, domain);
95
+
96
+ // Statistics
97
+ const stats = sona.getStats();
98
+ ```
99
+
100
+ ### RL Algorithms
101
+
102
+ ```typescript
103
+ import { PPO, A2C, DQN, QLearning, SARSA, DecisionTransformer } from '@claude-flow/neural';
104
+
105
+ // Proximal Policy Optimization
106
+ const ppo = new PPO({
107
+ learningRate: 0.0003,
108
+ epsilon: 0.2,
109
+ valueCoef: 0.5
110
+ });
111
+
112
+ // Advantage Actor-Critic
113
+ const a2c = new A2C({
114
+ learningRate: 0.001,
115
+ gamma: 0.99,
116
+ entropyCoef: 0.01
117
+ });
118
+
119
+ // Deep Q-Network
120
+ const dqn = new DQN({
121
+ learningRate: 0.001,
122
+ gamma: 0.99,
123
+ epsilon: 0.1,
124
+ targetUpdateFreq: 100
125
+ });
126
+
127
+ // Decision Transformer
128
+ const dt = new DecisionTransformer({
129
+ contextLength: 20,
130
+ embeddingDim: 256,
131
+ numHeads: 4
132
+ });
133
+ ```
134
+
135
+ ### LoRA Configuration
136
+
137
+ ```typescript
138
+ // Get LoRA config for current mode
139
+ const loraConfig = sona.getLoRAConfig();
140
+ // {
141
+ // rank: 4,
142
+ // alpha: 8,
143
+ // dropout: 0.05,
144
+ // targetModules: ['q_proj', 'v_proj', 'k_proj', 'o_proj'],
145
+ // microLoRA: false
146
+ // }
147
+
148
+ // Initialize LoRA weights for a domain
149
+ const weights = sona.initializeLoRAWeights('code-generation');
150
+ ```
151
+
152
+ ### EWC++ (Elastic Weight Consolidation)
153
+
154
+ ```typescript
155
+ // Get EWC config
156
+ const ewcConfig = sona.getEWCConfig();
157
+ // {
158
+ // lambda: 2000,
159
+ // decay: 0.9,
160
+ // fisherSamples: 100,
161
+ // minFisher: 1e-8,
162
+ // online: true
163
+ // }
164
+
165
+ // Consolidate after learning a new task
166
+ sona.consolidateEWC();
167
+ ```
168
+
169
+ ### Event System
170
+
171
+ ```typescript
172
+ // Subscribe to neural events
173
+ sona.addEventListener((event) => {
174
+ switch (event.type) {
175
+ case 'trajectory_started':
176
+ console.log(`Started: ${event.trajectoryId}`);
177
+ break;
178
+ case 'trajectory_completed':
179
+ console.log(`Completed with quality: ${event.qualityScore}`);
180
+ break;
181
+ case 'pattern_matched':
182
+ console.log(`Pattern ${event.patternId} matched`);
183
+ break;
184
+ case 'learning_triggered':
185
+ console.log(`Learning: ${event.reason}`);
186
+ break;
187
+ case 'mode_changed':
188
+ console.log(`Mode: ${event.fromMode} -> ${event.toMode}`);
189
+ break;
190
+ }
191
+ });
192
+ ```
193
+
194
+ ## Mode Configurations
195
+
196
+ ```typescript
197
+ // Real-time mode (ultra-fast)
198
+ {
199
+ loraRank: 2,
200
+ learningRate: 0.001,
201
+ batchSize: 32,
202
+ trajectoryCapacity: 1000,
203
+ qualityThreshold: 0.7,
204
+ maxLatencyMs: 0.5
205
+ }
206
+
207
+ // Research mode (high quality)
208
+ {
209
+ loraRank: 16,
210
+ learningRate: 0.002,
211
+ batchSize: 64,
212
+ trajectoryCapacity: 10000,
213
+ qualityThreshold: 0.2,
214
+ maxLatencyMs: 100
215
+ }
216
+ ```
217
+
218
+ ## Performance Targets
219
+
220
+ | Metric | Target | Typical |
221
+ |--------|--------|---------|
222
+ | Adaptation latency | <0.05ms | 0.02ms |
223
+ | Pattern retrieval | <1ms | 0.5ms |
224
+ | Learning step | <10ms | 5ms |
225
+ | Quality improvement | +55% | +40-60% |
226
+ | Memory overhead | <50MB | 25-75MB |
227
+
228
+ ## TypeScript Types
229
+
230
+ ```typescript
231
+ import type {
232
+ SONAMode,
233
+ SONAModeConfig,
234
+ Trajectory,
235
+ TrajectoryStep,
236
+ Pattern,
237
+ PatternMatch,
238
+ NeuralStats,
239
+ NeuralEvent,
240
+ LoRAConfig,
241
+ LoRAWeights,
242
+ EWCConfig,
243
+ RLAlgorithm
244
+ } from '@claude-flow/neural';
245
+ ```
246
+
247
+ ## Dependencies
248
+
249
+ - [@claude-flow/memory](../memory) - Memory integration
250
+ - `@ruvector/sona` - SONA learning engine
251
+
252
+ ## Related Packages
253
+
254
+ - [@claude-flow/memory](../memory) - Vector memory for patterns
255
+ - [@claude-flow/integration](../integration) - agentic-flow integration
256
+ - [@claude-flow/performance](../performance) - Benchmarking
257
+
258
+ ## License
259
+
260
+ MIT
@@ -0,0 +1,361 @@
1
+ /**
2
+ * Advantage Actor-Critic (A2C)
3
+ *
4
+ * Implements synchronous A2C algorithm with:
5
+ * - Shared actor-critic network
6
+ * - N-step returns
7
+ * - Entropy regularization
8
+ * - Advantage normalization
9
+ *
10
+ * Performance Target: <10ms per update step
11
+ */
12
/**
 * Baseline hyper-parameters for the A2C learner.
 *
 * `A2CAlgorithm` spreads this object first and the caller's overrides second,
 * so every key here may be replaced individually.
 */
export const DEFAULT_A2C_CONFIG = {
    // Identifier of the algorithm this configuration belongs to.
    algorithm: 'a2c',
    // Step size; the learner divides it by the batch size when applying gradients.
    learningRate: 0.0007,
    // Discount factor used for returns and for GAE.
    gamma: 0.99,
    // Scale of the mean-entropy term subtracted from the policy-head gradient.
    entropyCoef: 0.01,
    // Scale of the value-error contribution to the value/shared gradients.
    valueLossCoef: 0.5,
    // Bound used to clamp each individual gradient element to [-max, +max].
    maxGradNorm: 0.5,
    // NOTE(review): `epochs` and `miniBatchSize` are not read by A2CAlgorithm
    // in this file; presumably kept for parity with other algorithm configs.
    epochs: 1,
    miniBatchSize: 32,
    // Minimum number of buffered experiences before `update()` does any work.
    nSteps: 5,
    // When true, advantages come from Generalized Advantage Estimation.
    useGAE: true,
    // GAE lambda (multiplied with gamma when accumulating advantages).
    gaeLambda: 0.95,
};
28
/**
 * A2C Algorithm Implementation
 *
 * A small actor-critic learner with one shared ReLU hidden layer feeding a
 * softmax policy head and a scalar value head. Experiences are buffered via
 * `addExperience()` and consumed in one batch by `update()`.
 *
 * All fields are plain public class fields (as in the compiled original), so
 * callers may inspect or overwrite weights directly.
 */
export class A2CAlgorithm {
    config;
    // Shared network weights (row-major: input index * hiddenDim + hidden index)
    sharedWeights;
    // Policy head weights (row-major: hidden index * numActions + action index)
    policyHead;
    // Value head weights (one per hidden unit)
    valueHead;
    // Optimizer state (exponential moving average of clamped gradients)
    sharedMomentum;
    policyMomentum;
    valueMomentum;
    // Experience buffer for n-step
    buffer = [];
    // Dimensions
    inputDim = 768;
    hiddenDim = 64;
    numActions = 4;
    // Statistics of the most recent update
    updateCount = 0;
    avgPolicyLoss = 0;
    avgValueLoss = 0;
    avgEntropy = 0;

    /**
     * @param {object} [config] - Overrides merged on top of DEFAULT_A2C_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_A2C_CONFIG, ...config };
        // Initialize network with small random weights
        const scale = Math.sqrt(2 / this.inputDim);
        this.sharedWeights = new Float32Array(this.inputDim * this.hiddenDim);
        this.policyHead = new Float32Array(this.hiddenDim * this.numActions);
        this.valueHead = new Float32Array(this.hiddenDim);
        for (let i = 0; i < this.sharedWeights.length; i++) {
            this.sharedWeights[i] = (Math.random() - 0.5) * scale;
        }
        for (let i = 0; i < this.policyHead.length; i++) {
            this.policyHead[i] = (Math.random() - 0.5) * 0.1;
        }
        for (let i = 0; i < this.valueHead.length; i++) {
            this.valueHead[i] = (Math.random() - 0.5) * 0.1;
        }
        // Initialize momentum (all zeros)
        this.sharedMomentum = new Float32Array(this.sharedWeights.length);
        this.policyMomentum = new Float32Array(this.policyHead.length);
        this.valueMomentum = new Float32Array(this.valueHead.length);
    }

    /**
     * Add experience from trajectory.
     *
     * Each step contributes one buffer entry built from `step.stateAfter`,
     * `step.action` (hashed to an action index) and `step.reward`.
     *
     * @param {{steps: Array<{stateAfter: ArrayLike<number>, action: string, reward: number}>}} trajectory
     */
    addExperience(trajectory) {
        for (const step of trajectory.steps) {
            const { probs, value, entropy } = this.evaluate(step.stateAfter);
            const action = this.hashAction(step.action);
            this.buffer.push({
                state: step.stateAfter,
                action,
                reward: step.reward,
                value,
                // 1e-8 keeps the log finite if the probability underflows to 0.
                logProb: Math.log(probs[action] + 1e-8),
                entropy,
            });
        }
    }

    /**
     * Perform A2C update over the whole buffer, then clear it.
     * Target: <10ms (a warning is logged when exceeded).
     *
     * @returns {{policyLoss: number, valueLoss: number, entropy: number}}
     *   Per-experience averages for this batch; all zeros when the buffer is
     *   empty or holds fewer than `config.nSteps` experiences.
     */
    update() {
        const startTime = performance.now();
        // Capture the batch size up front: the buffer is cleared below and the
        // averages must be divided by the number of experiences processed.
        const batchSize = this.buffer.length;
        if (batchSize === 0 || batchSize < this.config.nSteps) {
            return { policyLoss: 0, valueLoss: 0, entropy: 0 };
        }
        // Compute returns and advantages
        const returns = this.computeReturns();
        const advantages = this.computeAdvantages(returns);
        // Initialize gradients
        const sharedGrad = new Float32Array(this.sharedWeights.length);
        const policyGrad = new Float32Array(this.policyHead.length);
        const valueGrad = new Float32Array(this.valueHead.length);
        let totalPolicyLoss = 0;
        let totalValueLoss = 0;
        let totalEntropy = 0;
        // Process all experiences
        for (let i = 0; i < batchSize; i++) {
            const exp = this.buffer[i];
            const advantage = advantages[i];
            const target = returns[i];
            // Re-evaluate with the current weights
            const { probs, value, hidden } = this.forwardWithHidden(exp.state);
            const logProb = Math.log(probs[exp.action] + 1e-8);
            totalPolicyLoss += -logProb * advantage;
            totalValueLoss += (value - target) ** 2;
            totalEntropy += this.computeEntropy(probs);
            this.accumulateGradients(sharedGrad, policyGrad, valueGrad, exp.state, hidden, exp.action, advantage, value - target);
        }
        // Entropy term. NOTE(review): this subtracts the same scalar from every
        // policy-head gradient entry rather than a per-weight entropy gradient;
        // kept as in the original.
        const entropyOffset = this.config.entropyCoef * totalEntropy / batchSize;
        for (let i = 0; i < policyGrad.length; i++) {
            policyGrad[i] -= entropyOffset;
        }
        // Apply gradients
        this.applyGradients(sharedGrad, policyGrad, valueGrad, batchSize);
        // Clear buffer and publish statistics for this batch
        this.buffer = [];
        this.updateCount++;
        this.avgPolicyLoss = totalPolicyLoss / batchSize;
        this.avgValueLoss = totalValueLoss / batchSize;
        this.avgEntropy = totalEntropy / batchSize;
        const elapsed = performance.now() - startTime;
        if (elapsed > 10) {
            console.warn(`A2C update exceeded target: ${elapsed.toFixed(2)}ms > 10ms`);
        }
        return {
            policyLoss: this.avgPolicyLoss,
            valueLoss: this.avgValueLoss,
            entropy: this.avgEntropy,
        };
    }

    /**
     * Get action from policy by sampling the softmax distribution.
     *
     * @param {ArrayLike<number>} state
     * @returns {{action: number, value: number}}
     */
    getAction(state) {
        const { probs, value } = this.evaluate(state);
        const action = this.sampleAction(probs);
        return { action, value };
    }

    /**
     * Get statistics of the learner.
     *
     * @returns {{updateCount: number, bufferSize: number, avgPolicyLoss: number, avgValueLoss: number, avgEntropy: number}}
     */
    getStats() {
        return {
            updateCount: this.updateCount,
            bufferSize: this.buffer.length,
            avgPolicyLoss: this.avgPolicyLoss,
            avgValueLoss: this.avgValueLoss,
            avgEntropy: this.avgEntropy,
        };
    }

    // ==========================================================================
    // Private Methods
    // ==========================================================================

    /** Forward pass plus the entropy of the resulting action distribution. */
    evaluate(state) {
        const { probs, value } = this.forward(state);
        return { probs, value, entropy: this.computeEntropy(probs) };
    }

    /** Forward pass returning only `{ probs, value }`. */
    forward(state) {
        const { probs, value } = this.forwardWithHidden(state);
        return { probs, value };
    }

    /**
     * Forward pass that also exposes the hidden activations (needed for
     * gradient accumulation). Inputs longer than `inputDim` are truncated;
     * shorter inputs only use the weights of the dimensions provided.
     */
    forwardWithHidden(state) {
        const usedInputs = Math.min(state.length, this.inputDim);
        // Shared hidden layer
        const hidden = new Float32Array(this.hiddenDim);
        for (let h = 0; h < this.hiddenDim; h++) {
            let sum = 0;
            for (let i = 0; i < usedInputs; i++) {
                sum += state[i] * this.sharedWeights[i * this.hiddenDim + h];
            }
            hidden[h] = Math.max(0, sum); // ReLU
        }
        // Policy head
        const logits = new Float32Array(this.numActions);
        for (let a = 0; a < this.numActions; a++) {
            let sum = 0;
            for (let h = 0; h < this.hiddenDim; h++) {
                sum += hidden[h] * this.policyHead[h * this.numActions + a];
            }
            logits[a] = sum;
        }
        const probs = this.softmax(logits);
        // Value head
        let value = 0;
        for (let h = 0; h < this.hiddenDim; h++) {
            value += hidden[h] * this.valueHead[h];
        }
        return { probs, value, hidden };
    }

    /** Shannon entropy (natural log) of a probability vector; zero entries are skipped. */
    computeEntropy(probs) {
        let entropy = 0;
        for (const p of probs) {
            if (p > 0) {
                entropy -= p * Math.log(p);
            }
        }
        return entropy;
    }

    /**
     * Discounted returns for every buffered step, seeded with the stored
     * value estimate of the last buffered experience.
     */
    computeReturns() {
        const returns = new Array(this.buffer.length).fill(0);
        let cumReturn = 0;
        // Bootstrap from last value if not terminal
        if (this.buffer.length > 0) {
            cumReturn = this.buffer[this.buffer.length - 1].value;
        }
        for (let t = this.buffer.length - 1; t >= 0; t--) {
            cumReturn = this.buffer[t].reward + this.config.gamma * cumReturn;
            returns[t] = cumReturn;
        }
        return returns;
    }

    /**
     * Normalised advantages: GAE when `config.useGAE` is set, otherwise
     * `return - value` using the values stored at collection time.
     */
    computeAdvantages(returns) {
        if (this.config.useGAE) {
            return this.computeGAE();
        }
        // Simple advantage: return - value
        const advantages = new Array(this.buffer.length).fill(0);
        for (let i = 0; i < this.buffer.length; i++) {
            advantages[i] = returns[i] - this.buffer[i].value;
        }
        return this.normalizeAdvantages(advantages);
    }

    /** Generalized Advantage Estimation over the buffer (next value is 0 past the end). */
    computeGAE() {
        const advantages = new Array(this.buffer.length).fill(0);
        let lastGae = 0;
        for (let t = this.buffer.length - 1; t >= 0; t--) {
            const nextValue = t < this.buffer.length - 1
                ? this.buffer[t + 1].value
                : 0;
            const delta = this.buffer[t].reward + this.config.gamma * nextValue - this.buffer[t].value;
            lastGae = delta + this.config.gamma * this.config.gaeLambda * lastGae;
            advantages[t] = lastGae;
        }
        return this.normalizeAdvantages(advantages);
    }

    /** Shift to zero mean and scale by (population std + 1e-8). */
    normalizeAdvantages(advantages) {
        const mean = advantages.reduce((a, b) => a + b, 0) / advantages.length;
        const std = Math.sqrt(advantages.reduce((a, b) => a + (b - mean) ** 2, 0) / advantages.length) + 1e-8;
        return advantages.map(a => (a - mean) / std);
    }

    /**
     * Add one experience's contribution to the three gradient buffers.
     *
     * The buffers hold gradients of the *loss* (they are subtracted in
     * `applyGradients`). The policy loss is `-log pi(a|s) * advantage`, so its
     * gradient with respect to the taken action's logit carries a negative
     * sign: a positive advantage must push that logit up after the descent
     * step. (The softmax `-p_k` terms for the other actions are not modelled.)
     */
    accumulateGradients(sharedGrad, policyGrad, valueGrad, state, hidden, action, advantage, valueError) {
        // Policy gradient (of the loss, hence the minus sign)
        for (let h = 0; h < this.hiddenDim; h++) {
            policyGrad[h * this.numActions + action] -= hidden[h] * advantage;
        }
        // Value gradient
        for (let h = 0; h < this.hiddenDim; h++) {
            valueGrad[h] += hidden[h] * valueError * this.config.valueLossCoef;
        }
        // Shared layer gradient (backprop through both heads)
        const usedInputs = Math.min(state.length, this.inputDim);
        for (let h = 0; h < this.hiddenDim; h++) {
            if (hidden[h] > 0) { // ReLU gradient
                const policySignal = -advantage * this.policyHead[h * this.numActions + action];
                const valueSignal = valueError * this.valueHead[h] * this.config.valueLossCoef;
                const totalSignal = policySignal + valueSignal;
                for (let i = 0; i < usedInputs; i++) {
                    sharedGrad[i * this.hiddenDim + h] += state[i] * totalSignal;
                }
            }
        }
    }

    /**
     * Momentum SGD step. Each gradient element is clamped to
     * [-maxGradNorm, +maxGradNorm] before entering the moving average, and
     * the learning rate is divided by the batch size.
     */
    applyGradients(sharedGrad, policyGrad, valueGrad, batchSize) {
        const lr = this.config.learningRate / batchSize;
        const beta = 0.9;
        const limit = this.config.maxGradNorm;
        const step = (weights, momentum, grads) => {
            for (let i = 0; i < weights.length; i++) {
                const grad = Math.max(Math.min(grads[i], limit), -limit);
                momentum[i] = beta * momentum[i] + (1 - beta) * grad;
                weights[i] -= lr * momentum[i];
            }
        };
        step(this.sharedWeights, this.sharedMomentum, sharedGrad);
        step(this.policyHead, this.policyMomentum, policyGrad);
        step(this.valueHead, this.valueMomentum, valueGrad);
    }

    /** Numerically stable softmax (subtracts the max logit before exponentiating). */
    softmax(logits) {
        const max = Math.max(...logits);
        const exps = new Float32Array(logits.length);
        let sum = 0;
        for (let i = 0; i < logits.length; i++) {
            exps[i] = Math.exp(logits[i] - max);
            sum += exps[i];
        }
        for (let i = 0; i < exps.length; i++) {
            exps[i] /= sum;
        }
        return exps;
    }

    /** Sample an index from a probability vector; falls back to the last index. */
    sampleAction(probs) {
        const r = Math.random();
        let cumSum = 0;
        for (let i = 0; i < probs.length; i++) {
            cumSum += probs[i];
            if (r < cumSum) {
                return i;
            }
        }
        return probs.length - 1;
    }

    /** Map an action name to an index in [0, numActions) with a rolling base-31 hash. */
    hashAction(action) {
        let hash = 0;
        for (let i = 0; i < action.length; i++) {
            hash = (hash * 31 + action.charCodeAt(i)) % this.numActions;
        }
        return hash;
    }
}
355
/**
 * Factory function: builds an A2C learner.
 *
 * @param {object} [config] - Passed straight through to the A2CAlgorithm constructor.
 * @returns {A2CAlgorithm} A freshly constructed learner.
 */
export function createA2C(config) {
    const learner = new A2CAlgorithm(config);
    return learner;
}
361
+ //# sourceMappingURL=a2c.js.map