cognitive-core 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/.claude/settings.json +111 -2
  2. package/.sessionlog/settings.json +4 -0
  3. package/dist/index.d.ts +1 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +5 -1
  6. package/dist/index.js.map +1 -1
  7. package/dist/learning/index.d.ts +1 -1
  8. package/dist/learning/index.d.ts.map +1 -1
  9. package/dist/learning/index.js.map +1 -1
  10. package/dist/learning/unified-pipeline.d.ts +30 -0
  11. package/dist/learning/unified-pipeline.d.ts.map +1 -1
  12. package/dist/learning/unified-pipeline.js +207 -0
  13. package/dist/learning/unified-pipeline.js.map +1 -1
  14. package/dist/memory/candidate-retrieval.d.ts.map +1 -1
  15. package/dist/memory/candidate-retrieval.js +3 -1
  16. package/dist/memory/candidate-retrieval.js.map +1 -1
  17. package/dist/utils/error-classifier.js +8 -8
  18. package/dist/utils/error-classifier.js.map +1 -1
  19. package/dist/workspace/efficacy-toolkit.d.ts +164 -0
  20. package/dist/workspace/efficacy-toolkit.d.ts.map +1 -0
  21. package/dist/workspace/efficacy-toolkit.js +281 -0
  22. package/dist/workspace/efficacy-toolkit.js.map +1 -0
  23. package/dist/workspace/index.d.ts +2 -1
  24. package/dist/workspace/index.d.ts.map +1 -1
  25. package/dist/workspace/index.js +3 -1
  26. package/dist/workspace/index.js.map +1 -1
  27. package/dist/workspace/templates/index.d.ts +3 -0
  28. package/dist/workspace/templates/index.d.ts.map +1 -1
  29. package/dist/workspace/templates/index.js +6 -0
  30. package/dist/workspace/templates/index.js.map +1 -1
  31. package/dist/workspace/templates/playbook-decay-detection.d.ts +46 -0
  32. package/dist/workspace/templates/playbook-decay-detection.d.ts.map +1 -0
  33. package/dist/workspace/templates/playbook-decay-detection.js +197 -0
  34. package/dist/workspace/templates/playbook-decay-detection.js.map +1 -0
  35. package/dist/workspace/templates/playbook-efficacy-audit.d.ts +46 -0
  36. package/dist/workspace/templates/playbook-efficacy-audit.d.ts.map +1 -0
  37. package/dist/workspace/templates/playbook-efficacy-audit.js +160 -0
  38. package/dist/workspace/templates/playbook-efficacy-audit.js.map +1 -0
  39. package/dist/workspace/templates/playbook-lifecycle-review.d.ts +51 -0
  40. package/dist/workspace/templates/playbook-lifecycle-review.d.ts.map +1 -0
  41. package/dist/workspace/templates/playbook-lifecycle-review.js +187 -0
  42. package/dist/workspace/templates/playbook-lifecycle-review.js.map +1 -0
  43. package/package.json +7 -1
  44. package/src/index.ts +27 -0
  45. package/src/learning/index.ts +1 -0
  46. package/src/learning/unified-pipeline.ts +271 -1
  47. package/src/memory/candidate-retrieval.ts +2 -1
  48. package/src/utils/error-classifier.ts +8 -8
  49. package/src/workspace/efficacy-toolkit.ts +496 -0
  50. package/src/workspace/index.ts +29 -0
  51. package/src/workspace/templates/index.ts +24 -0
  52. package/src/workspace/templates/playbook-decay-detection.ts +272 -0
  53. package/src/workspace/templates/playbook-efficacy-audit.ts +246 -0
  54. package/src/workspace/templates/playbook-lifecycle-review.ts +274 -0
  55. package/tests/fixtures/behavioral-trajectories.ts +210 -0
  56. package/tests/integration/pipeline-data-correctness.test.ts +794 -0
  57. package/tests/learning/meta-learner.test.ts +418 -0
  58. package/tests/learning/pipeline-memory-updates.test.ts +721 -0
  59. package/tests/learning/unified-pipeline-efficacy.test.ts +232 -0
  60. package/tests/memory/candidate-retrieval.test.ts +167 -0
  61. package/tests/memory/meta.test.ts +399 -0
  62. package/tests/search/evaluator.test.ts +257 -0
  63. package/tests/search/verification-runner.test.ts +357 -0
  64. package/tests/utils/error-classifier.test.ts +149 -0
  65. package/tests/utils/trajectory-helpers.test.ts +163 -0
  66. package/tests/workspace/efficacy-toolkit.test.ts +404 -0
  67. package/tests/workspace/templates/playbook-efficacy.test.ts +377 -0
@@ -0,0 +1,399 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import { MetaMemory, createMetaMemory } from '../../src/memory/meta.js';
3
+ import { createSqlitePersistence } from '../../src/persistence/index.js';
4
+ import { createMetaObservation, createMetaStrategy } from '../../src/types/meta.js';
5
+ import type { MetaObservation, RoutingDecisionType } from '../../src/types/meta.js';
6
+ import { mkdtemp, rm } from 'node:fs/promises';
7
+ import { join } from 'node:path';
8
+ import { tmpdir } from 'node:os';
9
+
10
/**
 * Test factory for a MetaObservation.
 *
 * Every field gets a fixed default; only the values listed in `overrides`
 * can be changed. When no `trajectoryId` is supplied a short random
 * `traj-…` id is generated. `outcome.quality` is 'poor' only when
 * `success` is explicitly `false`, otherwise 'good'.
 */
function makeObservation(overrides?: {
  trajectoryId?: string;
  decision?: RoutingDecisionType;
  success?: boolean;
  confidence?: number;
  playbooksApplied?: string[];
  retrievalQuality?: 'helpful' | 'neutral' | 'misleading';
  timestamp?: Date;
}): MetaObservation {
  const opts = overrides ?? {};
  // Random suffix is only computed when the caller did not pin an id.
  const trajectoryId = opts.trajectoryId ?? `traj-${Math.random().toString(36).slice(2, 6)}`;
  const explicitlyFailed = opts.success === false;

  return createMetaObservation({
    trajectoryId,
    routing: {
      decision: opts.decision ?? 'direct',
      confidence: opts.confidence ?? 0.8,
      reasoning: 'test routing',
    },
    memoryUsage: {
      experiencesRetrieved: 3,
      playbooksApplied: opts.playbooksApplied ?? [],
      retrievalQuality: opts.retrievalQuality ?? 'neutral',
    },
    execution: {
      decompositionUsed: false,
      refinementIterations: 0,
      backtrackingOccurred: false,
      toolsUsed: ['Read', 'Edit'],
    },
    outcome: {
      success: opts.success ?? true,
      quality: explicitlyFailed ? 'poor' : 'good',
      effortActual: 5,
      effortEstimate: 8,
    },
    lessons: {
      whatWorked: ['direct approach'],
      whatFailed: [],
      suggestions: [],
    },
    // Left undefined unless the caller pins a timestamp.
    timestamp: opts.timestamp,
  });
}
51
+
52
+ describe('MetaMemory', () => {
53
// Per-test fixtures: each test gets its own temp directory, a SQLite
// persistence rooted in it, and a MetaMemory built on that persistence.
let tempDir: string;
// Derived from the factory instead of `any`, so typos on the persistence
// object are caught by the type checker.
let persistence: ReturnType<typeof createSqlitePersistence>;
let meta: MetaMemory;

beforeEach(async () => {
  tempDir = await mkdtemp(join(tmpdir(), 'meta-mem-test-'));
  persistence = createSqlitePersistence({ baseDir: tempDir });
  await persistence.init();
  meta = createMetaMemory(persistence);
  await meta.init();
});

afterEach(async () => {
  // Awaiting is a no-op if close() is synchronous; if it returns a promise,
  // this guarantees the store is closed before its directory is deleted.
  await persistence.close();
  await rm(tempDir, { recursive: true, force: true });
});
69
+
70
// Round-trip checks: what is recorded can be read back, in bulk and by id.
describe('recordObservation and retrieval', () => {
  it('should store and retrieve observations', async () => {
    await meta.recordObservation(makeObservation({ trajectoryId: 'traj-1' }));

    const stored = await meta.getObservations();
    expect(stored).toHaveLength(1);
    expect(stored[0].trajectoryId).toBe('traj-1');
  });

  it('should retrieve by trajectory ID', async () => {
    // Recorded one after another so insertion order is traj-A, then traj-B.
    for (const trajectoryId of ['traj-A', 'traj-B']) {
      await meta.recordObservation(makeObservation({ trajectoryId }));
    }

    const found = await meta.getByTrajectoryId('traj-A');
    expect(found).toBeDefined();
    expect(found?.trajectoryId).toBe('traj-A');
  });

  it('should return undefined for non-existent trajectory ID', async () => {
    const missing = await meta.getByTrajectoryId('nonexistent');
    expect(missing).toBeUndefined();
  });
});
94
+
95
// Each test seeds a fresh store and checks one getObservations() filter.
describe('observation filtering', () => {
  it('should filter by routing decision', async () => {
    const decisions: RoutingDecisionType[] = ['direct', 'explore', 'direct'];
    for (const decision of decisions) {
      await meta.recordObservation(makeObservation({ decision }));
    }

    const directOnly = await meta.getObservations({ routingDecision: 'direct' });
    expect(directOnly).toHaveLength(2);
    expect(directOnly.every((obs) => obs.routing.decision === 'direct')).toBe(true);
  });

  it('should filter by outcome', async () => {
    // Two successes and one failure.
    for (const success of [true, false, true]) {
      await meta.recordObservation(makeObservation({ success }));
    }

    const succeeded = await meta.getObservations({ outcome: 'success' });
    expect(succeeded).toHaveLength(2);

    const failed = await meta.getObservations({ outcome: 'failure' });
    expect(failed).toHaveLength(1);
  });

  it('should filter by date range', async () => {
    // One observation on each side of the 2026-01-01 cutoff.
    const timestamps = [new Date('2025-01-01'), new Date('2026-03-01')];
    for (const timestamp of timestamps) {
      await meta.recordObservation(makeObservation({ timestamp }));
    }

    const sinceCutoff = await meta.getObservations({ minDate: new Date('2026-01-01') });
    expect(sinceCutoff).toHaveLength(1);
  });

  it('should limit results', async () => {
    let remaining = 5;
    while (remaining > 0) {
      await meta.recordObservation(makeObservation());
      remaining -= 1;
    }

    const capped = await meta.getObservations({ limit: 3 });
    expect(capped).toHaveLength(3);
  });
});
137
+
138
// Adding, fetching and listing meta-strategies.
describe('strategy management', () => {
  it('should add and retrieve a strategy', async () => {
    const created = createMetaStrategy({
      name: 'test-strategy',
      condition: { taskCharacteristics: ['complex'], memoryState: [] },
      adjustment: {
        routingBias: { explore: 0.2 },
        retrievalModification: 'expand',
        executionHint: 'decompose',
      },
    });
    await meta.addStrategy(created);

    const fetched = await meta.getStrategy(created.id);
    expect(fetched).toBeDefined();
    expect(fetched?.name).toBe('test-strategy');
    expect(fetched?.condition.taskCharacteristics).toContain('complex');
  });

  it('should list all strategies', async () => {
    for (const name of ['strategy-0', 'strategy-1', 'strategy-2']) {
      const blank = createMetaStrategy({
        name,
        condition: { taskCharacteristics: [], memoryState: [] },
        adjustment: { routingBias: {}, retrievalModification: '', executionHint: '' },
      });
      await meta.addStrategy(blank);
    }

    const listed = await meta.getAllStrategies();
    expect(listed).toHaveLength(3);
  });

  it('should return undefined for non-existent strategy', async () => {
    const missing = await meta.getStrategy('nonexistent');
    expect(missing).toBeUndefined();
  });
});
178
+
179
// Bookkeeping performed when a strategy is reported as applied.
describe('recordStrategyApplication', () => {
  it('should update application count and success rate', async () => {
    const strategy = createMetaStrategy({
      name: 'test',
      condition: { taskCharacteristics: [], memoryState: [] },
      adjustment: { routingBias: {}, retrievalModification: '', executionHint: '' },
    });
    await meta.addStrategy(strategy);

    await meta.recordStrategyApplication(strategy.id, true);
    const updated = await meta.getStrategy(strategy.id);

    expect(updated!.applicationCount).toBe(1);
    expect(updated!.successRate).toBeGreaterThan(0);
  });

  it('should use exponential moving average for success rate', async () => {
    const strategy = createMetaStrategy({
      name: 'ema-test',
      condition: { taskCharacteristics: [], memoryState: [] },
      adjustment: { routingBias: {}, retrievalModification: '', executionHint: '' },
    });
    strategy.successRate = 0.5; // Set initial
    await meta.addStrategy(strategy);

    await meta.recordStrategyApplication(strategy.id, true);
    const after = await meta.getStrategy(strategy.id);

    // EMA: 0.5 * (1 - 0.1) + 1 * 0.1 = 0.55
    expect(after!.successRate).toBeCloseTo(0.55, 2);
  });

  it('should no-op for nonexistent strategy', async () => {
    // A rejection from this call fails the test.
    await meta.recordStrategyApplication('nonexistent', true);

    // "No-op" also means nothing was created as a side effect; previously
    // this test had no assertion and could not detect that.
    expect(await meta.getStrategy('nonexistent')).toBeUndefined();
    expect(await meta.getAllStrategies()).toHaveLength(0);
  });
});
216
+
217
// Matching of stored strategy conditions against a routing context.
describe('getRoutingAdjustments', () => {
  type StrategyCondition = Parameters<typeof createMetaStrategy>[0]['condition'];

  // Stores a strategy that differs from a blank one only in name, condition
  // and (optionally) execution hint.
  const storeStrategy = (name: string, condition: StrategyCondition, executionHint = '') =>
    meta.addStrategy(
      createMetaStrategy({
        name,
        condition,
        adjustment: { routingBias: {}, retrievalModification: '', executionHint },
      })
    );

  it('should return empty when no strategies match', async () => {
    await storeStrategy('specific', {
      taskCharacteristics: ['database migration'],
      memoryState: ['no experiences'],
    });

    const matches = await meta.getRoutingAdjustments({
      taskCharacteristics: ['UI change'],
      memoryState: ['plenty of experiences'],
    });
    expect(matches).toHaveLength(0);
  });

  it('should match strategies with empty conditions (always applies)', async () => {
    await storeStrategy('universal', { taskCharacteristics: [], memoryState: [] }, 'always');

    const matches = await meta.getRoutingAdjustments({
      taskCharacteristics: ['anything'],
      memoryState: ['anything'],
    });
    expect(matches).toHaveLength(1);
  });

  it('should match via case-insensitive substring', async () => {
    await storeStrategy('multi-file', {
      taskCharacteristics: ['multi-file'],
      memoryState: [],
    });

    // Same phrase, different casing, embedded in a longer string.
    const matches = await meta.getRoutingAdjustments({
      taskCharacteristics: ['This is a Multi-File Change'],
      memoryState: [],
    });
    expect(matches).toHaveLength(1);
  });
});
272
+
273
// Aggregation of recorded outcomes per routing decision.
describe('analyzeRoutingEffectiveness', () => {
  it('should aggregate success rates by routing decision', async () => {
    // Three successful 'direct' runs followed by two failed ones.
    for (const success of [true, true, true, false, false]) {
      await meta.recordObservation(makeObservation({ decision: 'direct', success }));
    }

    const { byDecision } = await meta.analyzeRoutingEffectiveness();
    expect(byDecision.direct.total).toBeGreaterThanOrEqual(5);
    expect(byDecision.direct.successful).toBeGreaterThanOrEqual(3);
  });

  it('should generate recommendations for low success rate', async () => {
    // Five 'explore' runs, all failed.
    let pending = 5;
    while (pending > 0) {
      await meta.recordObservation(makeObservation({ decision: 'explore', success: false }));
      pending -= 1;
    }

    const { recommendations } = await meta.analyzeRoutingEffectiveness();
    expect(recommendations.length).toBeGreaterThan(0);
    expect(recommendations[0]).toContain('explore');
  });
});
296
+
297
// Classification of playbooks by the retrieval quality recorded with them.
describe('analyzeRetrievalQuality', () => {
  it('should track helpful and misleading playbook patterns', async () => {
    const usages: Array<[string, 'helpful' | 'misleading']> = [
      ['pb-1', 'helpful'],
      ['pb-1', 'helpful'],
      ['pb-2', 'misleading'],
    ];
    for (const [playbookId, retrievalQuality] of usages) {
      await meta.recordObservation(
        makeObservation({ playbooksApplied: [playbookId], retrievalQuality })
      );
    }

    const report = await meta.analyzeRetrievalQuality();
    expect(report.helpfulPatterns).toContain('pb-1');
    expect(report.misleadingPatterns).toContain('pb-2');
  });

  it('should recommend refinement for consistently misleading playbooks', async () => {
    // Four observations, all marking 'pb-bad' as misleading.
    let pending = 4;
    while (pending > 0) {
      await meta.recordObservation(
        makeObservation({ playbooksApplied: ['pb-bad'], retrievalQuality: 'misleading' })
      );
      pending -= 1;
    }

    const report = await meta.analyzeRetrievalQuality();
    expect(report.recommendations.length).toBeGreaterThan(0);
    expect(report.recommendations[0]).toContain('pb-bad');
  });
});
326
+
327
// Strategy synthesis from recorded observations.
describe('generateMetaStrategies', () => {
  const isLowConfidenceExplore = (s: { name: string }) => s.name === 'low-confidence-explore';

  it('should generate low-confidence-explore strategy when pattern detected', async () => {
    // Six low-confidence observations, only the first one successful (1/6).
    for (const success of [true, false, false, false, false, false]) {
      await meta.recordObservation(makeObservation({ confidence: 0.3, success }));
    }

    const generated = await meta.generateMetaStrategies();
    const match = generated.find(isLowConfidenceExplore);
    expect(match).toBeDefined();
    expect(match?.adjustment.routingBias.explore).toBe(0.2);
  });

  it('should not duplicate existing strategies', async () => {
    let pending = 6;
    while (pending > 0) {
      await meta.recordObservation(makeObservation({ confidence: 0.3, success: false }));
      pending -= 1;
    }

    // The first run may create the strategy; its result is not inspected.
    await meta.generateMetaStrategies();
    // The second run must not return it again.
    const secondRun = await meta.generateMetaStrategies();
    expect(secondRun.filter(isLowConfidenceExplore)).toHaveLength(0); // Already existed, not re-created
  });
});
355
+
356
// Summary statistics over the whole store.
describe('getStats', () => {
  it('should return zeroes for empty store', async () => {
    const { totalObservations, totalStrategies, averageSuccessRate, recentTrend } =
      await meta.getStats();

    expect(totalObservations).toBe(0);
    expect(totalStrategies).toBe(0);
    expect(averageSuccessRate).toBe(0);
    expect(recentTrend).toBe('stable');
  });

  it('should count observations by routing decision', async () => {
    const decisions: RoutingDecisionType[] = ['direct', 'direct', 'explore'];
    for (const decision of decisions) {
      await meta.recordObservation(makeObservation({ decision }));
    }

    const summary = await meta.getStats();
    expect(summary.totalObservations).toBe(3);
    expect(summary.observationsByRouting.direct).toBe(2);
    expect(summary.observationsByRouting.explore).toBe(1);
  });

  it('should compute average success rate', async () => {
    // Two successes out of three observations.
    for (const success of [true, true, false]) {
      await meta.recordObservation(makeObservation({ success }));
    }

    const summary = await meta.getStats();
    expect(summary.averageSuccessRate).toBeCloseTo(2 / 3, 2);
  });
});
385
+
386
// Age-based pruning of stored observations.
describe('pruneOldObservations', () => {
  it('should remove observations older than maxAge', async () => {
    const DAY_MS = 24 * 60 * 60 * 1000;
    const sixtyDaysAgo = new Date(Date.now() - 60 * DAY_MS);

    await meta.recordObservation(makeObservation({ timestamp: sixtyDaysAgo }));
    await meta.recordObservation(makeObservation()); // no explicit timestamp

    // 30-day cutoff: only the 60-day-old observation is past it.
    const removedCount = await meta.pruneOldObservations(30 * DAY_MS);
    expect(removedCount).toBeGreaterThanOrEqual(1);

    const survivors = await meta.getObservations();
    expect(survivors).toHaveLength(1);
  });
});
399
+ });
@@ -0,0 +1,257 @@
1
+ import { describe, it, expect, beforeEach } from 'vitest';
2
+ import {
3
+ SolutionEvaluator,
4
+ createSolutionEvaluator,
5
+ type VerificationResult,
6
+ } from '../../src/search/evaluator.js';
7
+ import { createTrajectory } from '../../src/types/trajectory.js';
8
+ import { createTask } from '../../src/types/task.js';
9
+ import { createStep } from '../../src/types/step.js';
10
+ import { successOutcome, failureOutcome } from '../../src/types/outcome.js';
11
+ import type { Trajectory } from '../../src/types/trajectory.js';
12
+ import type { Task } from '../../src/types/task.js';
13
+
14
/**
 * Test factory for a Task in the 'code' domain with a fixed description.
 * Any field present in `overrides` replaces the corresponding default.
 */
function makeTask(overrides?: Partial<Task>): Task {
  const fields: Parameters<typeof createTask>[0] = {
    domain: 'code',
    description: 'fix a bug in the login module',
    // Spread last so caller-supplied values win over the defaults above.
    ...overrides,
  };
  return createTask(fields);
}
21
+
22
/**
 * Test factory for a Trajectory owned by 'agent-1'.
 *
 * - `stepCount` (default 3): number of steps, labelled "Step 1", "Step 2", …
 * - `errorSteps` (default 0): the first N steps get an error observation,
 *   the remaining ones observe 'ok'.
 * - `withAttribution`: when truthy every step carries attributionScore 0.2,
 *   otherwise the score is left undefined.
 * - `success` (default true): selects a success or a failure outcome.
 */
function makeTrajectory(opts?: {
  success?: boolean;
  stepCount?: number;
  errorSteps?: number;
  withAttribution?: boolean;
}): Trajectory {
  const total = opts?.stepCount ?? 3;
  const failingPrefix = opts?.errorSteps ?? 0;
  const score = opts?.withAttribution ? 0.2 : undefined;
  const succeeded = opts?.success ?? true;

  const steps: ReturnType<typeof createStep>[] = [];
  let index = 0;
  while (index < total) {
    const observation = index < failingPrefix ? 'error: something failed' : 'ok';
    steps.push(
      createStep({
        action: `Step ${index + 1}`,
        observation,
        attributionScore: score,
      })
    );
    index += 1;
  }

  return createTrajectory({
    task: makeTask(),
    steps,
    outcome: succeeded ? successOutcome('done') : failureOutcome('failed'),
    agentId: 'agent-1',
  });
}
49
+
50
+ describe('SolutionEvaluator', () => {
51
// Rebuilt before every test so verifiers registered in one test never leak
// into another. `null` is passed as the first argument, and the tests in this
// suite expect evaluation to end on the verification or heuristic path.
let evaluator: SolutionEvaluator;

beforeEach((): void => {
  const fresh = createSolutionEvaluator(null);
  evaluator = fresh;
});
56
+
57
// No verifier is registered in this group, so every result is expected to
// come from the heuristic path.
describe('heuristic evaluation (fallback)', () => {
  // A failed trajectory that contains no steps at all.
  const steplessFailure = (reason: string): Trajectory =>
    createTrajectory({
      task: makeTask(),
      steps: [],
      outcome: failureOutcome(reason),
      agentId: 'a',
    });

  it('should rate successful trajectory with few steps well', async () => {
    const shortSuccess = makeTrajectory({ success: true, stepCount: 3 });
    const verdict = await evaluator.evaluate(shortSuccess, makeTask());

    expect(verdict.method).toBe('heuristic');
    expect(verdict.score).toBeGreaterThanOrEqual(0.7);
    expect(verdict.acceptable).toBe(true);
    expect(verdict.issues).toHaveLength(0);
  });

  it('should penalize failed trajectory', async () => {
    const failed = makeTrajectory({ success: false });
    const verdict = await evaluator.evaluate(failed, makeTask());

    expect(verdict.score).toBeLessThan(0.6);
    expect(verdict.acceptable).toBe(false);
    expect(verdict.issues.length).toBeGreaterThan(0);
    expect(verdict.issues[0].type).toBe('error');
  });

  it('should penalize empty trajectory', async () => {
    const verdict = await evaluator.evaluate(steplessFailure('no steps taken'), makeTask());

    expect(verdict.score).toBeLessThan(0.3);
    expect(verdict.issues.some((issue) => issue.type === 'incomplete')).toBe(true);
  });

  it('should penalize very long trajectories', async () => {
    const longRun = makeTrajectory({ success: true, stepCount: 25 });
    const verdict = await evaluator.evaluate(longRun, makeTask());

    expect(verdict.issues.some((issue) => issue.type === 'inefficient')).toBe(true);
  });

  it('should penalize trajectories with error steps', async () => {
    // Same length and outcome; only the first three observations differ.
    const noisy = await evaluator.evaluate(
      makeTrajectory({ success: true, stepCount: 5, errorSteps: 3 }),
      makeTask()
    );
    const clean = await evaluator.evaluate(
      makeTrajectory({ success: true, stepCount: 5 }),
      makeTask()
    );

    expect(noisy.score).toBeLessThan(clean.score);
  });

  it('should boost score for steps with high attribution', async () => {
    // NOTE(review): makeTrajectory sets attributionScore to 0.2 here; whether
    // that counts as "high" depends on the evaluator — confirm.
    const attributed = await evaluator.evaluate(
      makeTrajectory({ success: true, withAttribution: true }),
      makeTask()
    );
    const plain = await evaluator.evaluate(makeTrajectory({ success: true }), makeTask());

    expect(attributed.score).toBeGreaterThanOrEqual(plain.score);
  });

  it('should clamp score between 0 and 1', async () => {
    // Worst case used here: failed outcome and no steps.
    const verdict = await evaluator.evaluate(steplessFailure('total failure'), makeTask());

    expect(verdict.score).toBeGreaterThanOrEqual(0);
    expect(verdict.score).toBeLessThanOrEqual(1);
  });
});
143
+
144
// Behaviour when a domain verifier is registered on the evaluator.
describe('verification-based evaluation', () => {
  const codeTask = (): Task => makeTask({ domain: 'code' });

  // Builds a zero-argument verifier that resolves to the given result.
  const answering =
    (verdict: VerificationResult) =>
    async (): Promise<VerificationResult> =>
      verdict;

  it('should use registered verifier for matching domain', async () => {
    const passing = async (_t: Trajectory, _task: Task): Promise<VerificationResult> => ({
      passed: true,
      confidence: 0.95,
      issues: [],
      details: 'all tests pass',
    });
    evaluator.registerVerifier('code', passing);

    const outcome = await evaluator.evaluate(makeTrajectory(), codeTask());

    expect(outcome.method).toBe('verification');
    expect(outcome.acceptable).toBe(true);
    expect(outcome.score).toBeGreaterThanOrEqual(0.7);
  });

  it('should fall through to heuristic when verification confidence too low', async () => {
    // Confidence 0.3 is below the default threshold of 0.8.
    evaluator.registerVerifier('code', answering({ passed: true, confidence: 0.3 }));

    const outcome = await evaluator.evaluate(makeTrajectory(), codeTask());
    expect(outcome.method).toBe('heuristic');
  });

  it('should fall through when verifier throws', async () => {
    const crashing = async (): Promise<VerificationResult> => {
      throw new Error('verification crashed');
    };
    evaluator.registerVerifier('code', crashing);

    const outcome = await evaluator.evaluate(makeTrajectory(), codeTask());
    expect(outcome.method).toBe('heuristic');
  });

  it('should handle failed verification with issues', async () => {
    evaluator.registerVerifier(
      'code',
      answering({
        passed: false,
        confidence: 0.9,
        issues: [
          { type: 'incorrect', description: 'test failed: should return 42', severity: 'major' },
        ],
      })
    );

    const outcome = await evaluator.evaluate(makeTrajectory(), codeTask());

    expect(outcome.method).toBe('verification');
    expect(outcome.acceptable).toBe(false);
    expect(outcome.score).toBeLessThan(0.5);
    expect(outcome.issues).toHaveLength(1);
  });

  it('should not use verifier for wrong domain', async () => {
    // Registered for 'python' while the task below is in the 'code' domain.
    evaluator.registerVerifier('python', answering({ passed: true, confidence: 0.95 }));

    const outcome = await evaluator.evaluate(makeTrajectory(), codeTask());
    expect(outcome.method).toBe('heuristic');
  });
});
217
+
218
// Effect of the alwaysUseAgent option on the verification shortcut.
describe('alwaysUseAgent config', () => {
  it('should skip verification shortcut when alwaysUseAgent=true', async () => {
    const agentOnly = createSolutionEvaluator(null, { alwaysUseAgent: true });

    // A near-certain passing verifier that is nevertheless expected to be
    // bypassed by this configuration.
    const confidentPass = async (): Promise<VerificationResult> => ({
      passed: true,
      confidence: 0.99,
    });
    agentOnly.registerVerifier('code', confidentPass);

    // With no agent or taskRunner, evaluation falls through to the heuristic.
    const outcome = await agentOnly.evaluate(makeTrajectory(), makeTask({ domain: 'code' }));
    expect(outcome.method).toBe('heuristic');
  });
});
233
+
234
describe('parseQuality edge cases', () => {
  // NOTE(review): despite the title, no agent is available here, so the
  // result comes from the heuristic fallback. The test only checks that the
  // reported quality is one of the four known labels.
  it('should handle unknown quality strings gracefully', async () => {
    const knownLabels = ['excellent', 'good', 'needs_work', 'poor'];

    const { quality } = await evaluator.evaluate(makeTrajectory(), makeTask());
    expect(knownLabels).toContain(quality);
  });
});
242
+ });
243
+
244
// Table-driven check of the score → quality-label mapping, including the
// boundary values 0.85, 0.7 and 0.4.
describe('scoreToQuality (used by evaluator)', () => {
  it('should map scores to correct quality levels', async () => {
    // Loaded lazily inside the test rather than through a top-level import.
    const { scoreToQuality: toQuality } = await import('../../src/search/refinement-types.js');

    const expectations: Array<[number, string]> = [
      [0.9, 'excellent'],
      [0.85, 'excellent'],
      [0.75, 'good'],
      [0.7, 'good'],
      [0.5, 'needs_work'],
      [0.4, 'needs_work'],
      [0.3, 'poor'],
      [0.0, 'poor'],
    ];

    for (const [score, label] of expectations) {
      expect(toQuality(score)).toBe(label);
    }
  });
});