agentic-flow 1.7.2 → 1.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/test-neural.md +0 -5
- package/.claude/answer.md +1 -0
- package/.claude/settings.json +19 -20
- package/CHANGELOG.md +0 -91
- package/README.md +17 -81
- package/dist/agentdb/benchmarks/comprehensive-benchmark.js +664 -0
- package/dist/agentdb/benchmarks/frontier-benchmark.js +419 -0
- package/dist/agentdb/benchmarks/reflexion-benchmark.js +370 -0
- package/dist/agentdb/cli/agentdb-cli.js +717 -0
- package/dist/agentdb/controllers/CausalMemoryGraph.js +322 -0
- package/dist/agentdb/controllers/CausalRecall.js +281 -0
- package/dist/agentdb/controllers/EmbeddingService.js +118 -0
- package/dist/agentdb/controllers/ExplainableRecall.js +387 -0
- package/dist/agentdb/controllers/NightlyLearner.js +382 -0
- package/dist/agentdb/controllers/ReflexionMemory.js +239 -0
- package/dist/agentdb/controllers/SkillLibrary.js +276 -0
- package/dist/agentdb/controllers/frontier-index.js +9 -0
- package/dist/agentdb/controllers/index.js +8 -0
- package/dist/agentdb/index.js +32 -0
- package/dist/agentdb/optimizations/BatchOperations.js +198 -0
- package/dist/agentdb/optimizations/QueryOptimizer.js +225 -0
- package/dist/agentdb/optimizations/index.js +7 -0
- package/dist/agentdb/tests/frontier-features.test.js +665 -0
- package/dist/cli/skills-manager.js +3 -1
- package/dist/cli-proxy.js +2 -33
- package/dist/mcp/standalone-stdio.js +200 -4
- package/dist/memory/SharedMemoryPool.js +211 -0
- package/dist/memory/index.js +6 -0
- package/dist/reasoningbank/AdvancedMemory.js +239 -0
- package/dist/reasoningbank/HybridBackend.js +305 -0
- package/dist/reasoningbank/index-new.js +87 -0
- package/dist/reasoningbank/index.js +23 -44
- package/dist/utils/cli.js +0 -22
- package/docs/AGENTDB_TESTING.md +411 -0
- package/docs/v1.7.1-QUICK-START.md +399 -0
- package/package.json +4 -4
- package/scripts/run-validation.sh +165 -0
- package/scripts/test-agentdb.sh +153 -0
- package/.claude/skills/agentdb-memory-patterns/SKILL.md +0 -166
- package/.claude/skills/agentdb-vector-search/SKILL.md +0 -126
- package/.claude/skills/agentic-flow/agentdb-memory-patterns/SKILL.md +0 -166
- package/.claude/skills/agentic-flow/agentdb-vector-search/SKILL.md +0 -126
- package/.claude/skills/agentic-flow/reasoningbank-intelligence/SKILL.md +0 -201
- package/.claude/skills/agentic-flow/swarm-orchestration/SKILL.md +0 -179
- package/.claude/skills/reasoningbank-intelligence/SKILL.md +0 -201
- package/.claude/skills/skill-builder/README.md +0 -308
- package/.claude/skills/skill-builder/SKILL.md +0 -910
- package/.claude/skills/skill-builder/docs/SPECIFICATION.md +0 -358
- package/.claude/skills/skill-builder/resources/schemas/skill-frontmatter.schema.json +0 -41
- package/.claude/skills/skill-builder/resources/templates/full-skill.template +0 -118
- package/.claude/skills/skill-builder/resources/templates/minimal-skill.template +0 -38
- package/.claude/skills/skill-builder/scripts/generate-skill.sh +0 -334
- package/.claude/skills/skill-builder/scripts/validate-skill.sh +0 -198
- package/.claude/skills/swarm-orchestration/SKILL.md +0 -179
- package/docs/AGENTDB_INTEGRATION.md +0 -379
|
@@ -0,0 +1,665 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Comprehensive Tests for Frontier Features
|
|
3
|
+
*
|
|
4
|
+
* Validates:
|
|
5
|
+
* 1. CausalMemoryGraph - causal inference, uplift calculation, A/B testing
|
|
6
|
+
* 2. ExplainableRecall - minimal hitting sets, Merkle proofs, provenance
|
|
7
|
+
*
|
|
8
|
+
* NO MOCKING - Real SQLite database, real algorithms, real results
|
|
9
|
+
*/
|
|
10
|
+
import Database from 'better-sqlite3';
|
|
11
|
+
import * as fs from 'fs';
|
|
12
|
+
import * as path from 'path';
|
|
13
|
+
import { CausalMemoryGraph } from '../controllers/CausalMemoryGraph';
|
|
14
|
+
import { ExplainableRecall } from '../controllers/ExplainableRecall';
|
|
15
|
+
/**
 * CausalMemoryGraph suite.
 *
 * Every test runs against a fresh in-memory SQLite database that has the core
 * and frontier schemas applied and five seed episodes inserted.
 */
describe('Frontier Features - CausalMemoryGraph', () => {
  /** Database handle for the current test; undefined outside a test. */
  let db;
  /** CausalMemoryGraph instance bound to `db`. */
  let causalGraph;

  /**
   * Builds an episode-to-episode causal edge payload.
   *
   * @param {number} fromMemoryId - Id of the cause episode.
   * @param {number} toMemoryId - Id of the effect episode.
   * @param {object} props - Remaining edge fields (similarity, uplift, ...).
   * @returns {object} Payload accepted by `causalGraph.addCausalEdge`.
   */
  const episodeEdge = (fromMemoryId, toMemoryId, props) => ({
    fromMemoryId,
    fromMemoryType: 'episode',
    toMemoryId,
    toMemoryType: 'episode',
    ...props,
  });

  /**
   * Creates a running experiment whose treatment is episode 1.
   *
   * @param {string} name - Experiment name.
   * @param {string} hypothesis - Experiment hypothesis.
   * @returns {number} Id returned by `causalGraph.createExperiment`.
   */
  const createRunningExperiment = (name, hypothesis) =>
    causalGraph.createExperiment({
      name,
      hypothesis,
      treatmentId: 1,
      treatmentType: 'episode',
      startTime: Date.now(),
      sampleSize: 0,
      status: 'running',
    });

  /**
   * Records one 'reward' observation per outcome value, in order.
   *
   * @param {number} experimentId - Experiment receiving the observations.
   * @param {number} episodeId - Episode the observations are attributed to.
   * @param {boolean} isTreatment - true for treatment group, false for control.
   * @param {number[]} outcomes - Outcome values to record.
   */
  const recordRewards = (experimentId, episodeId, isTreatment, outcomes) => {
    outcomes.forEach((outcomeValue) => {
      causalGraph.recordObservation({
        experimentId,
        episodeId,
        isTreatment,
        outcomeValue,
        outcomeType: 'reward',
      });
    });
  };

  /**
   * Produces `count` deterministic values cycling through
   * base, base + 0.01, ..., base + 0.09 so test runs are reproducible.
   *
   * @param {number} base - Lowest value of the spread.
   * @param {number} count - Number of values to generate.
   * @returns {number[]} Generated outcome values.
   */
  const fixedSpread = (base, count) =>
    Array.from({ length: count }, (_, i) => base + (i % 10) * 0.01);

  beforeEach(() => {
    // Create in-memory database and load the core schema, then the frontier schema
    db = new Database(':memory:');
    ['schema.sql', 'frontier-schema.sql'].forEach((schemaFile) => {
      db.exec(fs.readFileSync(path.join(__dirname, '../schemas', schemaFile), 'utf-8'));
    });

    causalGraph = new CausalMemoryGraph(db);

    // Insert test episodes: [session_id, task, reward, success]
    const seedEpisodes = [
      ['session1', 'code_review', 0.8, 1],
      ['session1', 'bug_fix', 0.9, 1],
      ['session2', 'feature_impl', 0.7, 1],
      ['session2', 'testing', 0.85, 1],
      ['session3', 'refactoring', 0.75, 1],
    ];
    const insertEpisode = db.prepare(`
      INSERT INTO episodes (session_id, task, reward, success)
      VALUES (?, ?, ?, ?)
    `);
    seedEpisodes.forEach((row) => {
      insertEpisode.run(...row);
    });
  });

  afterEach(() => {
    // Guard the close: if beforeEach threw before `db` was assigned, an
    // unconditional db.close() would raise a TypeError and hide the real failure.
    if (db) {
      db.close();
      db = undefined;
    }
  });

  test('should add causal edge with all properties', () => {
    const edgeId = causalGraph.addCausalEdge(
      episodeEdge(1, 2, {
        similarity: 0.85,
        uplift: 0.15,
        confidence: 0.9,
        sampleSize: 100,
        evidenceIds: ['exp1', 'exp2'],
        mechanism: 'code_review improves bug_fix success',
      })
    );

    expect(edgeId).toBeGreaterThan(0);

    // Verify the row was stored with every property intact
    const row = db.prepare('SELECT * FROM causal_edges WHERE id = ?').get(edgeId);
    expect(row.from_memory_id).toBe(1);
    expect(row.to_memory_id).toBe(2);
    expect(row.similarity).toBe(0.85);
    expect(row.uplift).toBe(0.15);
    expect(row.confidence).toBe(0.9);
    expect(row.sample_size).toBe(100);
    expect(JSON.parse(row.evidence_ids)).toEqual(['exp1', 'exp2']);
    expect(row.mechanism).toBe('code_review improves bug_fix success');
  });

  test('should create and track A/B experiment', () => {
    const expId = causalGraph.createExperiment({
      name: 'Test Code Review Impact',
      hypothesis: 'Code review reduces bugs',
      treatmentId: 1,
      treatmentType: 'episode',
      controlId: 3,
      startTime: Date.now(),
      sampleSize: 0,
      status: 'running',
    });

    expect(expId).toBeGreaterThan(0);

    // Verify stored
    const row = db.prepare('SELECT * FROM causal_experiments WHERE id = ?').get(expId);
    expect(row.name).toBe('Test Code Review Impact');
    expect(row.status).toBe('running');
    expect(row.sample_size).toBe(0);
  });

  test('should record observations and update sample size', () => {
    const expId = createRunningExperiment('Test Experiment', 'Treatment improves outcome');

    // One treatment observation followed by one control observation
    recordRewards(expId, 1, true, [0.9]);
    recordRewards(expId, 2, false, [0.7]);

    // Check sample size updated
    const experimentRow = db.prepare('SELECT * FROM causal_experiments WHERE id = ?').get(expId);
    expect(experimentRow.sample_size).toBe(2);

    // Check observations stored
    const observationRows = db
      .prepare('SELECT * FROM causal_observations WHERE experiment_id = ?')
      .all(expId);
    expect(observationRows).toHaveLength(2);
  });

  test('should calculate uplift with statistical significance', () => {
    const expId = createRunningExperiment('Uplift Test', 'Treatment increases reward');

    // 50 treatment observations in 0.75-0.84 (mean ~0.8) and 50 control
    // observations in 0.55-0.64 (mean ~0.6). The values are deterministic so the
    // statistical assertions below cannot vary between runs.
    recordRewards(expId, 1, true, fixedSpread(0.75, 50));
    recordRewards(expId, 2, false, fixedSpread(0.55, 50));

    const result = causalGraph.calculateUplift(expId);

    // Validate results
    expect(result.uplift).toBeGreaterThan(0.1); // Treatment > Control
    expect(result.uplift).toBeLessThan(0.3); // Reasonable range
    expect(result.pValue).toBeLessThan(0.05); // Statistically significant
    const [ciLow, ciHigh] = result.confidenceInterval;
    expect(ciLow).toBeLessThan(result.uplift);
    expect(ciHigh).toBeGreaterThan(result.uplift);

    // Verify experiment updated
    const experimentRow = db.prepare('SELECT * FROM causal_experiments WHERE id = ?').get(expId);
    expect(experimentRow.status).toBe('completed');
    expect(experimentRow.uplift).toBeCloseTo(result.uplift, 5);
    expect(experimentRow.p_value).toBeCloseTo(result.pValue, 5);
  });

  test('should query causal effects by confidence and uplift', () => {
    // Three edges out of episode 1; only the first passes both filters below
    causalGraph.addCausalEdge(
      episodeEdge(1, 2, { similarity: 0.8, uplift: 0.2, confidence: 0.9, sampleSize: 100 })
    );
    causalGraph.addCausalEdge(
      // Low uplift
      episodeEdge(1, 3, { similarity: 0.7, uplift: 0.05, confidence: 0.95, sampleSize: 80 })
    );
    causalGraph.addCausalEdge(
      // Low confidence
      episodeEdge(1, 4, { similarity: 0.85, uplift: 0.25, confidence: 0.4, sampleSize: 50 })
    );

    // Query with filters
    const effects = causalGraph.queryCausalEffects({
      interventionMemoryId: 1,
      interventionMemoryType: 'episode',
      minConfidence: 0.8,
      minUplift: 0.1,
    });

    // Should only return first edge (high confidence + high uplift)
    expect(effects).toHaveLength(1);
    const [onlyEffect] = effects;
    expect(onlyEffect.toMemoryId).toBe(2);
    expect(onlyEffect.confidence).toBe(0.9);
    expect(onlyEffect.uplift).toBe(0.2);
  });

  test('should find multi-hop causal chains', () => {
    // Create chain: 1 -> 2 -> 3
    causalGraph.addCausalEdge(
      episodeEdge(1, 2, { similarity: 0.8, uplift: 0.1, confidence: 0.9, sampleSize: 100 })
    );
    causalGraph.addCausalEdge(
      episodeEdge(2, 3, { similarity: 0.75, uplift: 0.15, confidence: 0.85, sampleSize: 80 })
    );
    // Also add direct edge 1 -> 3 (lower uplift)
    causalGraph.addCausalEdge(
      episodeEdge(1, 3, { similarity: 0.7, uplift: 0.05, confidence: 0.8, sampleSize: 60 })
    );

    const chains = causalGraph.getCausalChain(1, 3, 5);

    // At least one path must be found; only the 2-hop path is inspected below
    expect(chains.length).toBeGreaterThanOrEqual(1);

    // The 2-hop path should have higher total uplift
    const twoHopChain = chains.find((chain) => chain.path.length === 3); // [1, 2, 3]
    expect(twoHopChain).toBeDefined();
    expect(twoHopChain.path).toEqual([1, 2, 3]);
    expect(twoHopChain.totalUplift).toBeCloseTo(0.25, 1); // 0.1 + 0.15
    expect(twoHopChain.confidence).toBeGreaterThanOrEqual(0.85); // Min of chain
  });

  test('should detect potential confounders', () => {
    // Create scenario where episode 3 might be a confounder
    // for the relationship between episode 1 and episode 2
    const edgeId = causalGraph.addCausalEdge(
      episodeEdge(1, 2, { similarity: 0.8, uplift: 0.2, confidence: 0.9, sampleSize: 100 })
    );

    const result = causalGraph.detectConfounders(edgeId);

    // Should return confounders array (may be empty in test data)
    expect(result).toHaveProperty('confounders');
    expect(Array.isArray(result.confounders)).toBe(true);

    // Each confounder should have required properties
    const requiredKeys = [
      'memoryId',
      'correlationWithTreatment',
      'correlationWithOutcome',
      'confounderScore',
    ];
    result.confounders.forEach((confounder) => {
      requiredKeys.forEach((key) => {
        expect(confounder).toHaveProperty(key);
      });
      expect(confounder.confounderScore).toBeGreaterThan(0.3);
    });
  });

  test('should calculate causal gain vs baseline', () => {
    // Add causal edges from treatment (episode 1) to outcomes
    causalGraph.addCausalEdge(
      episodeEdge(1, 2, {
        similarity: 0.85,
        uplift: 0.2,
        confidence: 0.9,
        mechanism: 'improves outcome',
      })
    );

    const result = causalGraph.calculateCausalGain(1, 'reward');

    // Should calculate difference between treated and untreated episodes
    ['causalGain', 'confidence', 'mechanism'].forEach((key) => {
      expect(result).toHaveProperty(key);
    });
    expect(result.confidence).toBeGreaterThanOrEqual(0);
    expect(result.confidence).toBeLessThanOrEqual(1);
    expect(typeof result.causalGain).toBe('number');
  });
});
|
|
304
|
+
/**
 * ExplainableRecall suite.
 *
 * Every test runs against a fresh in-memory SQLite database that has the core
 * and frontier schemas applied and ten seed episodes inserted.
 */
describe('Frontier Features - ExplainableRecall', () => {
  /** Database handle for the current test; undefined outside a test. */
  let db;
  /** ExplainableRecall instance bound to `db`. */
  let explainableRecall;

  /**
   * Builds an 'episode' chunk as passed to `createCertificate`.
   *
   * @param {string} id - Chunk id.
   * @param {string} content - Chunk text.
   * @param {number} relevance - Relevance score assigned to the chunk.
   * @returns {{id: string, type: string, content: string, relevance: number}}
   */
  const episodeChunk = (id, content, relevance) => ({
    id,
    type: 'episode',
    content,
    relevance,
  });

  /**
   * Builds `count` chunks with ids '1'..'count', content 'Content N' and
   * relevance decreasing by 0.05 per chunk starting at 0.9.
   *
   * @param {number} count - Number of chunks to generate.
   * @returns {object[]} Generated chunks.
   */
  const numberedChunks = (count) =>
    Array.from({ length: count }, (_, i) =>
      episodeChunk(`${i + 1}`, `Content ${i + 1}`, 0.9 - i * 0.05)
    );

  beforeEach(() => {
    db = new Database(':memory:');
    ['schema.sql', 'frontier-schema.sql'].forEach((schemaFile) => {
      db.exec(fs.readFileSync(path.join(__dirname, '../schemas', schemaFile), 'utf-8'));
    });

    explainableRecall = new ExplainableRecall(db);

    // Insert test episodes. The statement is prepared once and reused because
    // it does not change between iterations.
    const insertEpisode = db.prepare(`
      INSERT INTO episodes (session_id, task, reward, success)
      VALUES (?, ?, ?, ?)
    `);
    for (let i = 1; i <= 10; i++) {
      insertEpisode.run(`session${i}`, `task${i}`, 0.8, 1);
    }
  });

  afterEach(() => {
    // Guard the close: if beforeEach threw before `db` was assigned, an
    // unconditional db.close() would raise a TypeError and hide the real failure.
    if (db) {
      db.close();
      db = undefined;
    }
  });

  test('should create recall certificate with minimal hitting set', () => {
    const chunks = [
      episodeChunk('1', 'Implement authentication', 0.9),
      episodeChunk('2', 'Add JWT tokens', 0.85),
      episodeChunk('3', 'Hash passwords', 0.8),
      episodeChunk('4', 'Setup database', 0.7),
    ];
    const requirements = ['authentication', 'security', 'tokens'];

    const certificate = explainableRecall.createCertificate({
      queryId: 'q1',
      queryText: 'How to implement secure authentication?',
      chunks,
      requirements,
      accessLevel: 'internal',
    });

    // Validate certificate structure
    expect(certificate).toHaveProperty('id');
    expect(certificate.queryId).toBe('q1');
    expect(certificate.chunkIds).toEqual(['1', '2', '3', '4']);
    expect(certificate.chunkTypes).toEqual(['episode', 'episode', 'episode', 'episode']);

    // Validate minimal hitting set
    const { minimalWhy } = certificate;
    expect(minimalWhy).toBeDefined();
    expect(minimalWhy.length).toBeGreaterThan(0);
    expect(minimalWhy.length).toBeLessThanOrEqual(chunks.length);

    // Redundancy ratio should be >= 1
    expect(certificate.redundancyRatio).toBeGreaterThanOrEqual(1);
    expect(certificate.redundancyRatio).toBe(chunks.length / minimalWhy.length);

    // Completeness should be 0-1
    expect(certificate.completenessScore).toBeGreaterThanOrEqual(0);
    expect(certificate.completenessScore).toBeLessThanOrEqual(1);

    // Merkle root should exist
    expect(certificate.merkleRoot).toBeDefined();
    expect(certificate.merkleRoot.length).toBeGreaterThan(0);

    // Source hashes should match chunks
    expect(certificate.sourceHashes).toHaveLength(chunks.length);

    // Access level
    expect(certificate.accessLevel).toBe('internal');

    // Latency tracking
    // NOTE(review): this assumes createCertificate never reports 0 ms; confirm
    // the timer resolution used by ExplainableRecall before relying on it.
    expect(certificate.latencyMs).toBeGreaterThan(0);
  });

  test('should verify certificate integrity', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q2',
      queryText: 'Test query',
      chunks: [
        episodeChunk('1', 'Test content 1', 0.9),
        episodeChunk('2', 'Test content 2', 0.8),
      ],
      requirements: ['test'],
      accessLevel: 'public',
    });

    // Verify the certificate
    const verification = explainableRecall.verifyCertificate(certificate.id);
    expect(verification.valid).toBe(true);
    expect(verification.issues).toHaveLength(0);
  });

  test('should detect tampered certificate', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q3',
      queryText: 'Test query',
      chunks: [episodeChunk('1', 'Original content', 0.9)],
      requirements: ['test'],
      accessLevel: 'internal',
    });

    // Tamper with the certificate in database by rewriting its chunk id list
    const tamperedChunkIds = JSON.stringify(['1', '999']);
    db.prepare(`
      UPDATE recall_certificates
      SET chunk_ids = ?
      WHERE id = ?
    `).run(tamperedChunkIds, certificate.id);

    // Verification should fail
    const verification = explainableRecall.verifyCertificate(certificate.id);
    expect(verification.valid).toBe(false);
    expect(verification.issues.length).toBeGreaterThan(0);
    expect(verification.issues[0]).toContain('Merkle root');
  });

  test('should provide justification for each chunk', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q4',
      queryText: 'How to create API?',
      chunks: [
        episodeChunk('1', 'Setup API endpoint', 0.9),
        episodeChunk('2', 'Add validation', 0.85),
      ],
      requirements: ['api', 'validation'],
      accessLevel: 'internal',
    });

    // Get justification for first chunk
    const justification = explainableRecall.getJustification(certificate.id, '1');
    expect(justification).toBeDefined();
    expect(justification.chunkId).toBe('1');
    expect(justification.chunkType).toBe('episode');
    expect(justification.reason).toBeDefined();
    expect(justification.necessityScore).toBeGreaterThanOrEqual(0);
    expect(justification.necessityScore).toBeLessThanOrEqual(1);
    expect(justification.pathElements).toBeDefined();
    expect(Array.isArray(justification.pathElements)).toBe(true);
  });

  test('should track provenance lineage', () => {
    // Create source with provenance
    const sourceId = explainableRecall.createProvenance({
      sourceType: 'episode',
      sourceId: 1,
      creator: 'test_user',
    });
    expect(sourceId).toBeGreaterThan(0);

    // Get the content hash
    const sourceRow = db.prepare('SELECT * FROM provenance_sources WHERE id = ?').get(sourceId);
    expect(sourceRow).toBeDefined();
    expect(sourceRow.content_hash).toBeDefined();

    // Track lineage
    const lineage = explainableRecall.getProvenanceLineage(sourceRow.content_hash);
    expect(lineage).toHaveLength(1);
    const [origin] = lineage;
    expect(origin.sourceType).toBe('episode');
    expect(origin.sourceId).toBe(1);
    expect(origin.creator).toBe('test_user');
  });

  test('should audit certificate for quality metrics', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q5',
      queryText: 'Test audit query',
      chunks: [
        episodeChunk('1', 'Content 1', 0.95),
        episodeChunk('2', 'Content 2', 0.9),
        episodeChunk('3', 'Content 3', 0.85),
        episodeChunk('4', 'Content 4', 0.5), // Low relevance
      ],
      requirements: ['req1', 'req2'],
      accessLevel: 'internal',
    });

    const audit = explainableRecall.auditCertificate(certificate.id);

    // Validate audit structure
    [
      'certificateId',
      'queryText',
      'totalChunks',
      'minimalSetSize',
      'redundancyRatio',
      'completenessScore',
      'avgNecessityScore',
      'provenanceVerified',
      'qualityScore',
    ].forEach((key) => {
      expect(audit).toHaveProperty(key);
    });

    expect(audit.certificateId).toBe(certificate.id);
    expect(audit.totalChunks).toBe(4);
    expect(audit.minimalSetSize).toBeGreaterThan(0);
    expect(audit.redundancyRatio).toBeGreaterThanOrEqual(1);
    expect(audit.qualityScore).toBeGreaterThanOrEqual(0);
    expect(audit.qualityScore).toBeLessThanOrEqual(1);
    expect(audit.provenanceVerified).toBe(true);
  });

  test('should handle empty chunks gracefully', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q6',
      queryText: 'Empty query',
      chunks: [],
      requirements: ['test'],
      accessLevel: 'public',
    });

    expect(certificate.chunkIds).toHaveLength(0);
    expect(certificate.minimalWhy).toHaveLength(0);
    expect(certificate.redundancyRatio).toBe(0);
    expect(certificate.completenessScore).toBe(0);
  });

  test('should calculate correct minimal hitting set', () => {
    // Test case: Each chunk covers different requirements
    const certificate = explainableRecall.createCertificate({
      queryId: 'q7',
      queryText: 'Full stack implementation',
      chunks: [
        episodeChunk('1', 'auth implementation', 0.9),
        episodeChunk('2', 'database setup', 0.85),
        episodeChunk('3', 'API endpoints', 0.8),
      ],
      requirements: ['auth', 'database', 'api'],
      accessLevel: 'internal',
    });

    // All chunks should be in minimal set (each covers unique requirement)
    expect(certificate.minimalWhy.length).toBe(3);
    expect(certificate.redundancyRatio).toBe(1.0);
  });

  test('should generate valid Merkle proofs', () => {
    const certificate = explainableRecall.createCertificate({
      queryId: 'q8',
      queryText: 'Merkle test',
      chunks: numberedChunks(8),
      requirements: ['test'],
      accessLevel: 'internal',
    });

    // Merkle root should have the length of a hex-encoded SHA-256 digest
    expect(certificate.merkleRoot).toBeDefined();
    expect(certificate.merkleRoot.length).toBe(64); // SHA-256 hex = 64 chars

    // One source hash per chunk, each the same digest length
    expect(certificate.sourceHashes).toHaveLength(8);
    certificate.sourceHashes.forEach((sourceHash) => {
      expect(sourceHash.length).toBe(64);
    });
  });
});
|
|
540
|
+
/**
 * Integration suite exercising CausalMemoryGraph and ExplainableRecall against
 * the same in-memory SQLite database.
 *
 * The shared setup applies both schemas but inserts no episodes; each test
 * seeds the rows it needs.
 */
describe('Frontier Features - Integration Tests', () => {
  /** Database handle for the current test; undefined outside a test. */
  let db;
  /** CausalMemoryGraph instance bound to `db`. */
  let causalGraph;
  /** ExplainableRecall instance bound to `db`. */
  let explainableRecall;

  /**
   * Inserts episodes 1..count with session `sessionN`, task `taskN` and
   * success = 1, using a single prepared statement.
   *
   * @param {number} count - Number of episodes to insert.
   * @param {(index: number) => number} rewardFor - Reward for the 1-based index.
   */
  const insertEpisodes = (count, rewardFor) => {
    const insertEpisode = db.prepare(`
      INSERT INTO episodes (session_id, task, reward, success)
      VALUES (?, ?, ?, ?)
    `);
    for (let i = 1; i <= count; i++) {
      insertEpisode.run(`session${i}`, `task${i}`, rewardFor(i), 1);
    }
  };

  /**
   * Builds an 'episode' chunk as passed to `createCertificate`.
   *
   * @param {string} id - Chunk id.
   * @param {string} content - Chunk text.
   * @param {number} relevance - Relevance score assigned to the chunk.
   * @returns {{id: string, type: string, content: string, relevance: number}}
   */
  const episodeChunk = (id, content, relevance) => ({
    id,
    type: 'episode',
    content,
    relevance,
  });

  beforeEach(() => {
    db = new Database(':memory:');
    ['schema.sql', 'frontier-schema.sql'].forEach((schemaFile) => {
      db.exec(fs.readFileSync(path.join(__dirname, '../schemas', schemaFile), 'utf-8'));
    });

    causalGraph = new CausalMemoryGraph(db);
    explainableRecall = new ExplainableRecall(db);
  });

  afterEach(() => {
    // Guard the close: if beforeEach threw before `db` was assigned, an
    // unconditional db.close() would raise a TypeError and hide the real failure.
    if (db) {
      db.close();
      db = undefined;
    }
  });

  test('should combine causal reasoning with explainable recall', () => {
    // 1. Insert episodes
    insertEpisodes(5, (i) => 0.7 + i * 0.05);

    // 2. Add causal edges
    causalGraph.addCausalEdge({
      fromMemoryId: 1,
      fromMemoryType: 'episode',
      toMemoryId: 2,
      toMemoryType: 'episode',
      similarity: 0.85,
      uplift: 0.15,
      confidence: 0.9,
      mechanism: 'task1 improves task2',
    });

    // 3. Create retrieval with provenance
    const certificate = explainableRecall.createCertificate({
      queryId: 'integrated_q1',
      queryText: 'Show me tasks with causal relationships',
      chunks: [
        episodeChunk('1', 'First task', 0.9),
        episodeChunk('2', 'Second task', 0.85),
      ],
      requirements: ['causal', 'evidence'],
      accessLevel: 'internal',
    });

    // 4. Verify we can query both systems
    const causalEffects = causalGraph.queryCausalEffects({
      interventionMemoryId: 1,
      interventionMemoryType: 'episode',
      minConfidence: 0.8,
    });
    expect(causalEffects.length).toBeGreaterThan(0);
    expect(certificate).toBeDefined();
    expect(certificate.minimalWhy.length).toBeGreaterThan(0);

    // 5. Audit the certificate
    const audit = explainableRecall.auditCertificate(certificate.id);
    expect(audit.provenanceVerified).toBe(true);
  });

  test('should handle large-scale causal experiments efficiently', () => {
    // The observations below reference episode ids 1-10, so make sure those
    // rows exist. NOTE(review): whether the schema enforces this reference is
    // not visible from this file; seeding is harmless either way.
    insertEpisodes(10, () => 0.8);

    // Create experiment with many observations
    const expId = causalGraph.createExperiment({
      name: 'Large Scale Test',
      hypothesis: 'Treatment improves outcome at scale',
      treatmentId: 1,
      treatmentType: 'episode',
      startTime: Date.now(),
      sampleSize: 0,
      status: 'running',
    });

    const recordStart = Date.now();

    // Record 1000 observations, alternating treatment (even i) and control
    // (odd i). Outcomes are deterministic: base 0.8 / 0.6 plus a 0.00-0.09
    // offset, so results are reproducible across runs.
    for (let i = 0; i < 1000; i++) {
      const isTreatment = i % 2 === 0;
      const offset = (Math.floor(i / 2) % 10) * 0.01;
      causalGraph.recordObservation({
        experimentId: expId,
        episodeId: (i % 10) + 1,
        isTreatment,
        outcomeValue: (isTreatment ? 0.8 : 0.6) + offset,
        outcomeType: 'reward',
      });
    }
    const recordTime = Date.now() - recordStart;

    // Calculate uplift
    const calcStart = Date.now();
    const result = causalGraph.calculateUplift(expId);
    const calcTime = Date.now() - calcStart;

    // Performance assertions
    expect(recordTime).toBeLessThan(1000); // < 1 second to record 1000 observations
    expect(calcTime).toBeLessThan(100); // < 100ms to calculate uplift

    // Result assertions
    expect(result.uplift).toBeGreaterThan(0.1);
    expect(result.pValue).toBeLessThan(0.05);
  });

  test('should handle concurrent certificate creation', () => {
    const chunks = Array.from({ length: 10 }, (_, i) =>
      episodeChunk(`${i + 1}`, `Content ${i + 1}`, 0.9 - i * 0.05)
    );

    // Create ten certificates back to back on the same connection; certificate
    // i covers the first i + 1 chunks. The calls are synchronous, so they run
    // one after another rather than in parallel.
    const certificates = Array.from({ length: 10 }, (_, i) =>
      explainableRecall.createCertificate({
        queryId: `concurrent_q${i}`,
        queryText: `Concurrent query ${i}`,
        chunks: chunks.slice(0, i + 1),
        requirements: ['test'],
        accessLevel: 'internal',
      })
    );

    // All should succeed
    expect(certificates).toHaveLength(10);
    certificates.forEach((cert, idx) => {
      expect(cert.queryId).toBe(`concurrent_q${idx}`);
      expect(cert.chunkIds.length).toBe(idx + 1);
    });

    // All should be verifiable
    certificates.forEach((cert) => {
      const verification = explainableRecall.verifyCertificate(cert.id);
      expect(verification.valid).toBe(true);
    });
  });
});
|
|
@@ -503,7 +503,9 @@ async function handleSkillsCreate(args) {
|
|
|
503
503
|
let count = 0;
|
|
504
504
|
for (const skillContent of skills) {
|
|
505
505
|
const skillName = extractSkillName(skillContent);
|
|
506
|
-
|
|
506
|
+
// Claude Code requires skills at TOP LEVEL: .claude/skills/[skill-name]/
|
|
507
|
+
// NOT in subdirectories: .claude/skills/namespace/[skill-name]/
|
|
508
|
+
const skillDir = join(projectSkillsDir, skillName);
|
|
507
509
|
mkdirSync(skillDir, { recursive: true });
|
|
508
510
|
writeFileSync(join(skillDir, 'SKILL.md'), skillContent, 'utf-8');
|
|
509
511
|
count++;
|