agentic-flow 1.7.2 → 1.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/.claude/agents/test-neural.md +0 -5
  2. package/.claude/answer.md +1 -0
  3. package/.claude/settings.json +19 -20
  4. package/CHANGELOG.md +0 -91
  5. package/README.md +17 -81
  6. package/dist/agentdb/benchmarks/comprehensive-benchmark.js +664 -0
  7. package/dist/agentdb/benchmarks/frontier-benchmark.js +419 -0
  8. package/dist/agentdb/benchmarks/reflexion-benchmark.js +370 -0
  9. package/dist/agentdb/cli/agentdb-cli.js +717 -0
  10. package/dist/agentdb/controllers/CausalMemoryGraph.js +322 -0
  11. package/dist/agentdb/controllers/CausalRecall.js +281 -0
  12. package/dist/agentdb/controllers/EmbeddingService.js +118 -0
  13. package/dist/agentdb/controllers/ExplainableRecall.js +387 -0
  14. package/dist/agentdb/controllers/NightlyLearner.js +382 -0
  15. package/dist/agentdb/controllers/ReflexionMemory.js +239 -0
  16. package/dist/agentdb/controllers/SkillLibrary.js +276 -0
  17. package/dist/agentdb/controllers/frontier-index.js +9 -0
  18. package/dist/agentdb/controllers/index.js +8 -0
  19. package/dist/agentdb/index.js +32 -0
  20. package/dist/agentdb/optimizations/BatchOperations.js +198 -0
  21. package/dist/agentdb/optimizations/QueryOptimizer.js +225 -0
  22. package/dist/agentdb/optimizations/index.js +7 -0
  23. package/dist/agentdb/tests/frontier-features.test.js +665 -0
  24. package/dist/cli/skills-manager.js +3 -1
  25. package/dist/cli-proxy.js +2 -33
  26. package/dist/mcp/standalone-stdio.js +200 -4
  27. package/dist/memory/SharedMemoryPool.js +211 -0
  28. package/dist/memory/index.js +6 -0
  29. package/dist/reasoningbank/AdvancedMemory.js +239 -0
  30. package/dist/reasoningbank/HybridBackend.js +305 -0
  31. package/dist/reasoningbank/index-new.js +87 -0
  32. package/dist/reasoningbank/index.js +23 -44
  33. package/dist/utils/cli.js +0 -22
  34. package/docs/AGENTDB_TESTING.md +411 -0
  35. package/docs/v1.7.1-QUICK-START.md +399 -0
  36. package/package.json +4 -4
  37. package/scripts/run-validation.sh +165 -0
  38. package/scripts/test-agentdb.sh +153 -0
  39. package/.claude/skills/agentdb-memory-patterns/SKILL.md +0 -166
  40. package/.claude/skills/agentdb-vector-search/SKILL.md +0 -126
  41. package/.claude/skills/agentic-flow/agentdb-memory-patterns/SKILL.md +0 -166
  42. package/.claude/skills/agentic-flow/agentdb-vector-search/SKILL.md +0 -126
  43. package/.claude/skills/agentic-flow/reasoningbank-intelligence/SKILL.md +0 -201
  44. package/.claude/skills/agentic-flow/swarm-orchestration/SKILL.md +0 -179
  45. package/.claude/skills/reasoningbank-intelligence/SKILL.md +0 -201
  46. package/.claude/skills/skill-builder/README.md +0 -308
  47. package/.claude/skills/skill-builder/SKILL.md +0 -910
  48. package/.claude/skills/skill-builder/docs/SPECIFICATION.md +0 -358
  49. package/.claude/skills/skill-builder/resources/schemas/skill-frontmatter.schema.json +0 -41
  50. package/.claude/skills/skill-builder/resources/templates/full-skill.template +0 -118
  51. package/.claude/skills/skill-builder/resources/templates/minimal-skill.template +0 -38
  52. package/.claude/skills/skill-builder/scripts/generate-skill.sh +0 -334
  53. package/.claude/skills/skill-builder/scripts/validate-skill.sh +0 -198
  54. package/.claude/skills/swarm-orchestration/SKILL.md +0 -179
  55. package/docs/AGENTDB_INTEGRATION.md +0 -379
@@ -0,0 +1,370 @@
1
+ /**
2
+ * Reflexion Memory Benchmark Suite
3
+ *
4
+ * Tests:
5
+ * 1. Latency: p95 end-to-end ≤ 50ms for k-NN over 50k memories
6
+ * 2. Hit Rate: Top-3 recall includes prior failure that predicts fix ≥ 60%
7
+ * 3. Improvement Tracking: Measure learning curves over episodes
8
+ */
9
+ import Database from 'better-sqlite3';
10
+ import { ReflexionMemory } from '../controllers/ReflexionMemory';
11
+ import { EmbeddingService } from '../controllers/EmbeddingService';
12
+ import * as fs from 'fs';
13
+ import * as path from 'path';
14
/**
 * Benchmark harness for ReflexionMemory.
 *
 * Owns a better-sqlite3 database handle, an EmbeddingService and a
 * ReflexionMemory instance, runs four benchmark scenarios against them and
 * collects one result record per scenario in `results`.
 *
 * Each result record has the shape
 * `{ testName: string, passed: boolean, metrics: object, details: string }`.
 */
export class ReflexionBenchmark {
    db;
    memory;
    embedder;
    results = [];

    /**
     * @param {string} [dbPath=':memory:'] Path handed to better-sqlite3;
     *   the default keeps the whole benchmark in an in-memory database.
     */
    constructor(dbPath = ':memory:') {
        this.db = new Database(dbPath);
        this.embedder = new EmbeddingService({
            model: 'all-MiniLM-L6-v2',
            dimension: 384,
            provider: 'transformers'
        });
        this.memory = new ReflexionMemory(this.db, this.embedder);
    }

    /**
     * Load the SQL schema into the database and initialize the embedder.
     * Must complete before any benchmark method is called.
     *
     * NOTE(review): relies on `__dirname`, which is only defined when this
     * file runs as CommonJS (or under a loader that shims it) — confirm the
     * runtime this file is executed in.
     *
     * @returns {Promise<void>}
     */
    async initialize() {
        // Load schema
        const schemaPath = path.join(__dirname, '../schemas/schema.sql');
        const schema = fs.readFileSync(schemaPath, 'utf-8');
        this.db.exec(schema);
        // Initialize embedder
        await this.embedder.initialize();
    }

    /**
     * Run all benchmarks in order, print the summary and return the results.
     *
     * @returns {Promise<Array<object>>} The accumulated result records.
     */
    async runAll() {
        console.log('🧪 Starting Reflexion Memory Benchmark Suite\n');
        console.log('━'.repeat(70));
        await this.testLatency();
        await this.testHitRate();
        await this.testImprovementTracking();
        await this.testPruning();
        this.printResults();
        return this.results;
    }

    /**
     * Test 1: Latency Budget
     * Goal: p95 ≤ 50ms for k-NN over 50k memories
     *
     * Stores 50,000 synthetic episodes, then times 100 `retrieveRelevant`
     * calls (k = 5) with `Date.now()` and reports average, p50, p95 and p99.
     *
     * @returns {Promise<void>}
     */
    async testLatency() {
        console.log('\n📊 Test 1: Latency Budget');
        console.log('Goal: p95 end-to-end ≤ 50ms for k-NN over 50k memories\n');
        const memoryCount = 50000;
        const queryCount = 100;
        // Generate test episodes
        console.log(`Generating ${memoryCount} test episodes...`);
        const startGen = Date.now();
        for (let i = 0; i < memoryCount; i++) {
            const episode = {
                sessionId: `session-${Math.floor(i / 100)}`,
                task: this.generateTaskName(i),
                input: `Input for task ${i}`,
                output: `Output for task ${i}`,
                critique: this.generateCritique(i),
                reward: Math.random(),
                success: Math.random() > 0.5
            };
            await this.memory.storeEpisode(episode);
            if ((i + 1) % 10000 === 0) {
                const elapsed = Date.now() - startGen;
                console.log(` Progress: ${i + 1}/${memoryCount} (${(elapsed / 1000).toFixed(1)}s)`);
            }
        }
        const genTime = Date.now() - startGen;
        console.log(`✓ Generated ${memoryCount} episodes in ${(genTime / 1000).toFixed(2)}s\n`);
        // Run retrieval queries
        console.log(`Running ${queryCount} retrieval queries...`);
        const latencies = [];
        for (let i = 0; i < queryCount; i++) {
            const task = this.generateTaskName(Math.floor(Math.random() * 10));
            const start = Date.now();
            await this.memory.retrieveRelevant({ task, k: 5 });
            const latency = Date.now() - start;
            latencies.push(latency);
        }
        // Calculate statistics (numeric comparator: default sort is lexicographic)
        latencies.sort((a, b) => a - b);
        const p50 = latencies[Math.floor(queryCount * 0.50)];
        const p95 = latencies[Math.floor(queryCount * 0.95)];
        const p99 = latencies[Math.floor(queryCount * 0.99)];
        const avg = latencies.reduce((a, b) => a + b, 0) / latencies.length;
        console.log(`\n📈 Latency Results:`);
        console.log(` Average: ${avg.toFixed(2)}ms`);
        console.log(` p50: ${p50}ms`);
        console.log(` p95: ${p95}ms`);
        console.log(` p99: ${p99}ms`);
        const passed = p95 <= 50;
        console.log(`\n${passed ? '✅ PASSED' : '❌ FAILED'}: p95 ${p95}ms ${passed ? '≤' : '>'} 50ms`);
        this.results.push({
            testName: 'Latency Budget',
            passed,
            metrics: { avg, p50, p95, p99, memoryCount, queryCount },
            details: `p95 latency: ${p95}ms (target: ≤50ms)`
        });
    }

    /**
     * Test 2: Hit Rate
     * Goal: Top-3 includes prior failure that predicts fix ≥ 60%
     *
     * For each of five tasks, stores three failed episodes and one
     * successful episode, then queries the top 3 with `onlyFailures: true`.
     *
     * @returns {Promise<void>}
     */
    async testHitRate() {
        // Blank line followed by one 70-character rule, matching printResults().
        console.log('\n' + '━'.repeat(70));
        console.log('\n📊 Test 2: Hit Rate');
        console.log('Goal: Top-3 recall includes prior failure ≥ 60%\n');
        const tasks = [
            'implement_binary_search',
            'create_rest_api',
            'parse_json_data',
            'handle_async_errors',
            'optimize_database_query'
        ];
        let totalTests = 0;
        let hits = 0;
        for (const task of tasks) {
            // Create failure episodes with specific critiques
            const failures = [
                { critique: 'Edge case: empty array not handled', reward: 0.2 },
                { critique: 'Performance: O(n²) instead of O(log n)', reward: 0.3 },
                { critique: 'Bug: off-by-one error in loop', reward: 0.1 }
            ];
            for (const failure of failures) {
                await this.memory.storeEpisode({
                    sessionId: `test-${task}`,
                    task,
                    input: 'test input',
                    output: 'failed output',
                    critique: failure.critique,
                    reward: failure.reward,
                    success: false
                });
            }
            // Create a successful episode
            await this.memory.storeEpisode({
                sessionId: `test-${task}`,
                task,
                input: 'test input',
                output: 'successful output',
                critique: 'Fixed: handled empty array edge case',
                reward: 0.9,
                success: true
            });
            // Query for top-3 failures
            const retrieved = await this.memory.retrieveRelevant({
                task,
                k: 3,
                onlyFailures: true
            });
            totalTests++;
            // Check if we got relevant failures.
            // NOTE(review): this only checks that some retrieved episode has a
            // non-empty critique; it does not verify that the episode is one of
            // the failures stored above — consider tightening.
            const hasRelevantFailure = retrieved.some(ep => ep.critique && ep.critique.length > 0);
            if (hasRelevantFailure) {
                hits++;
                console.log(`✓ ${task}: Found ${retrieved.length} relevant failures`);
            }
            else {
                console.log(`✗ ${task}: No relevant failures in top-3`);
            }
        }
        const hitRate = hits / totalTests;
        console.log(`\n📈 Hit Rate Results:`);
        console.log(` Total Tests: ${totalTests}`);
        console.log(` Hits: ${hits}`);
        console.log(` Hit Rate: ${(hitRate * 100).toFixed(1)}%`);
        const passed = hitRate >= 0.6;
        console.log(`\n${passed ? '✅ PASSED' : '❌ FAILED'}: Hit rate ${(hitRate * 100).toFixed(1)}% ${passed ? '≥' : '<'} 60%`);
        this.results.push({
            testName: 'Hit Rate',
            passed,
            metrics: { hitRate, totalTests, hits },
            details: `Hit rate: ${(hitRate * 100).toFixed(1)}% (target: ≥60%)`
        });
    }

    /**
     * Test 3: Improvement Tracking
     * Goal: Agents learn and improve over attempts
     *
     * Stores ten episodes whose rewards rise by 0.07 per attempt (plus
     * ±0.05 random noise), then passes when `getTaskStats(task)` reports a
     * positive `improvementTrend`.
     *
     * @returns {Promise<void>}
     */
    async testImprovementTracking() {
        // Blank line followed by one 70-character rule, matching printResults().
        console.log('\n' + '━'.repeat(70));
        console.log('\n📊 Test 3: Improvement Tracking');
        console.log('Goal: Measure learning curves over episodes\n');
        const task = 'implement_sorting_algorithm';
        const attempts = 10;
        const rewards = [];
        // Simulate learning: rewards should trend upward
        for (let i = 0; i < attempts; i++) {
            const baseReward = 0.3;
            const improvement = i * 0.07; // 7% improvement per attempt
            const noise = Math.random() * 0.1 - 0.05;
            const reward = Math.min(1.0, baseReward + improvement + noise);
            await this.memory.storeEpisode({
                sessionId: 'learning-test',
                task,
                input: `attempt ${i + 1}`,
                output: `output ${i + 1}`,
                critique: i < 5 ? `Issue: needs improvement` : `Better: applied learnings`,
                reward,
                success: reward > 0.7
            });
            rewards.push(reward);
        }
        // Calculate improvement trend
        const stats = this.memory.getTaskStats(task);
        console.log(`📈 Learning Progress:`);
        rewards.forEach((r, i) => {
            const bar = '█'.repeat(Math.floor(r * 30));
            console.log(` Attempt ${i + 1}: ${bar} ${(r * 100).toFixed(1)}%`);
        });
        console.log(`\n📊 Statistics:`);
        console.log(` Total Attempts: ${stats.totalAttempts}`);
        console.log(` Success Rate: ${(stats.successRate * 100).toFixed(1)}%`);
        console.log(` Average Reward: ${(stats.avgReward * 100).toFixed(1)}%`);
        console.log(` Improvement Trend: ${(stats.improvementTrend * 100).toFixed(1)}%`);
        const passed = stats.improvementTrend > 0;
        console.log(`\n${passed ? '✅ PASSED' : '❌ FAILED'}: Positive learning trend`);
        this.results.push({
            testName: 'Improvement Tracking',
            passed,
            metrics: {
                attempts: stats.totalAttempts,
                successRate: stats.successRate,
                avgReward: stats.avgReward,
                improvementTrend: stats.improvementTrend
            },
            details: `Improvement trend: ${(stats.improvementTrend * 100).toFixed(1)}%`
        });
    }

    /**
     * Test 4: Pruning Efficiency
     * Goal: Remove low-quality memories while preserving good ones
     *
     * Stores 20 high-reward and 80 low-reward episodes for one task, calls
     * `pruneEpisodes`, and passes when something was pruned, at least five
     * episodes remain for the task and their average reward is ≥ 0.5.
     *
     * @returns {Promise<void>}
     */
    async testPruning() {
        // Blank line followed by one 70-character rule, matching printResults().
        console.log('\n' + '━'.repeat(70));
        console.log('\n📊 Test 4: Pruning Efficiency');
        console.log('Goal: Remove low-quality memories efficiently\n');
        const task = 'pruning_test_task';
        // Create mix of high and low quality episodes
        const highQuality = 20;
        const lowQuality = 80;
        for (let i = 0; i < highQuality; i++) {
            await this.memory.storeEpisode({
                sessionId: 'pruning-test',
                task,
                input: `high quality ${i}`,
                output: `good output ${i}`,
                critique: 'Excellent work',
                reward: 0.8 + Math.random() * 0.2,
                success: true
            });
        }
        for (let i = 0; i < lowQuality; i++) {
            await this.memory.storeEpisode({
                sessionId: 'pruning-test',
                task,
                input: `low quality ${i}`,
                output: `poor output ${i}`,
                critique: 'Needs work',
                reward: 0.1 + Math.random() * 0.2,
                success: false
            });
        }
        const beforeCount = this.db.prepare('SELECT COUNT(*) as count FROM episodes WHERE task = ?')
            .get(task);
        console.log(`Before pruning: ${beforeCount.count} episodes`);
        // Prune low-quality episodes.
        // NOTE(review): every episode above was stored moments ago; whether
        // `maxAgeDays: 1` lets them be pruned depends on pruneEpisodes'
        // semantics, which are not visible here — confirm. The return value
        // is treated below as the number of removed episodes.
        const pruned = this.memory.pruneEpisodes({
            minReward: 0.5,
            maxAgeDays: 1,
            keepMinPerTask: 5
        });
        const afterCount = this.db.prepare('SELECT COUNT(*) as count FROM episodes WHERE task = ?')
            .get(task);
        console.log(`After pruning: ${afterCount.count} episodes`);
        console.log(`Removed: ${pruned} episodes`);
        const remainingQuality = this.db.prepare('SELECT AVG(reward) as avg_reward FROM episodes WHERE task = ?').get(task);
        console.log(`\n📊 Results:`);
        console.log(` Pruned: ${pruned} episodes`);
        console.log(` Retained: ${afterCount.count} episodes`);
        console.log(` Remaining Quality: ${(remainingQuality.avg_reward * 100).toFixed(1)}%`);
        const passed = pruned > 0 && afterCount.count >= 5 && remainingQuality.avg_reward >= 0.5;
        console.log(`\n${passed ? '✅ PASSED' : '❌ FAILED'}: Pruning maintained quality`);
        this.results.push({
            testName: 'Pruning Efficiency',
            passed,
            metrics: {
                pruned,
                retained: afterCount.count,
                avgQuality: remainingQuality.avg_reward
            },
            details: `Pruned ${pruned} episodes, retained ${afterCount.count} with ${(remainingQuality.avg_reward * 100).toFixed(1)}% quality`
        });
    }

    /**
     * Print summary results: one PASS/FAIL line per recorded benchmark
     * followed by the overall pass count and percentage.
     *
     * @returns {void}
     */
    printResults() {
        console.log('\n' + '━'.repeat(70));
        console.log('\n📊 BENCHMARK SUMMARY\n');
        console.log('━'.repeat(70));
        const passed = this.results.filter(r => r.passed).length;
        const total = this.results.length;
        if (total === 0) {
            // Nothing ran: avoid the 0/0 = NaN% figure and a misleading
            // "all benchmarks passed" banner.
            console.log('\n⚠️ No benchmark results recorded.\n');
            return;
        }
        this.results.forEach((result, i) => {
            const status = result.passed ? '✅ PASS' : '❌ FAIL';
            console.log(`\n${i + 1}. ${result.testName}: ${status}`);
            console.log(` ${result.details}`);
        });
        console.log('\n' + '━'.repeat(70));
        console.log(`\n🎯 Overall: ${passed}/${total} tests passed (${((passed / total) * 100).toFixed(1)}%)`);
        if (passed === total) {
            console.log('\n✨ All benchmarks passed! Reflexion memory is production-ready.\n');
        }
        else {
            console.log('\n⚠️ Some benchmarks failed. Review results above.\n');
        }
    }

    /**
     * Generate test data helpers
     *
     * Pick a task name from a fixed list of ten, cycling by `index`.
     *
     * @param {number} index Non-negative integer; wrapped with modulo.
     * @returns {string} The task name at `index % 10`.
     */
    generateTaskName(index) {
        const tasks = [
            'implement_binary_search',
            'create_rest_api',
            'parse_json_data',
            'optimize_query',
            'handle_errors',
            'validate_input',
            'format_output',
            'cache_results',
            'log_events',
            'test_coverage'
        ];
        return tasks[index % tasks.length];
    }

    /**
     * Pick a critique string from a fixed list of five, cycling by `index`.
     *
     * @param {number} index Non-negative integer; wrapped with modulo.
     * @returns {string} The critique at `index % 5`.
     */
    generateCritique(index) {
        const critiques = [
            'Edge case not handled',
            'Performance could be improved',
            'Error handling missing',
            'Input validation needed',
            'Output format incorrect'
        ];
        return critiques[index % critiques.length];
    }

    /**
     * Cleanup: close the underlying database handle.
     *
     * @returns {void}
     */
    close() {
        this.db.close();
    }
}
362
// Run benchmark if called directly.
// `require` and `module` exist only under CommonJS; probing them with
// `typeof` keeps this check from throwing a ReferenceError when the file is
// loaded as an ES module (for example when another module imports
// ReflexionBenchmark).
if (typeof require !== 'undefined' && typeof module !== 'undefined' && require.main === module) {
    (async () => {
        const benchmark = new ReflexionBenchmark();
        try {
            await benchmark.initialize();
            await benchmark.runAll();
        }
        finally {
            // Release the database handle even when a benchmark throws.
            benchmark.close();
        }
    })().catch((error) => {
        console.error(error);
        // Report the failure to the calling shell / CI instead of exiting 0.
        process.exitCode = 1;
    });
}