outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,769 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verification Engine Property Tests
|
|
3
|
+
*
|
|
4
|
+
* Property-based tests for hallucination detection, diff analysis,
|
|
5
|
+
* and confidence scoring.
|
|
6
|
+
*
|
|
7
|
+
* Requirements: 8.4, 8.5, 8.6
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, test, expect, beforeEach } from 'vitest';
|
|
11
|
+
import * as fc from 'fast-check';
|
|
12
|
+
import {
|
|
13
|
+
VerificationEngine,
|
|
14
|
+
createVerificationEngine,
|
|
15
|
+
} from '../verification/verification-engine.js';
|
|
16
|
+
import type { ActionLogEntry } from '../core/types.js';
|
|
17
|
+
|
|
18
|
+
// =============================================================================
|
|
19
|
+
// Test Setup
|
|
20
|
+
// =============================================================================
|
|
21
|
+
|
|
22
|
+
// Note: the beforeEach hook below assigns a fresh verificationEngine before each test.
|
|
23
|
+
// Within a single test, all fast-check runs of the property share that one instance.
|
|
24
|
+
|
|
25
|
+
// Engine under test; reassigned by the beforeEach hook that follows.
let verificationEngine: VerificationEngine;

// Build a new engine before every test. The four factor weights sum to 1.
beforeEach(() => {
  const engineConfig = {
    reliabilityThreshold: 0.7,
    actionSuccessWeight: 0.3,
    extractPresenceWeight: 0.25,
    alignmentWeight: 0.3,
    completenessWeight: 0.15,
  };

  verificationEngine = createVerificationEngine(engineConfig);
});
|
|
36
|
+
|
|
37
|
+
// =============================================================================
|
|
38
|
+
// Arbitraries
|
|
39
|
+
// =============================================================================
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Generate arbitrary session IDs.
|
|
43
|
+
*/
|
|
44
|
+
const sessionIdArbitrary = fc.stringMatching(/^session_[a-z0-9]{8,16}$/);

/**
 * Intent IDs: one of four category prefixes, the literal `_ID:`, then
 * 3-20 characters drawn from uppercase letters and underscores.
 */
const intentIdArbitrary = fc.stringMatching(/^(ACTION|INPUT|DISPLAY|NAV)_ID:[A-Z_]{3,20}$/);

/** Every action kind except 'extract'. */
type NonExtractActionType = 'click' | 'type' | 'navigate' | 'wait';

/** Every action kind generated by this file. */
type ActionType = NonExtractActionType | 'extract';

/** Outcome recorded for a single action. */
type ActionResult = 'success' | 'failure';

/**
 * Any action kind, including 'extract'.
 */
const actionTypeArbitrary = fc.constantFrom(
  'click',
  'type',
  'navigate',
  'wait',
  'extract'
) as fc.Arbitrary<ActionType>;

/**
 * Any action kind other than 'extract'.
 */
const nonExtractActionTypeArbitrary = fc.constantFrom(
  'click',
  'type',
  'navigate',
  'wait'
) as fc.Arbitrary<NonExtractActionType>;

/**
 * Either 'success' or 'failure'.
 */
const actionResultArbitrary = fc.constantFrom(
  'success',
  'failure'
) as fc.Arbitrary<ActionResult>;
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Generate arbitrary action log entries with timestamp.
|
|
83
|
+
*/
|
|
84
|
+
const actionLogEntryArbitrary = buildLogEntryArbitrary(
  actionTypeArbitrary,
  actionResultArbitrary
);

/**
 * Log entries whose action is never 'extract'.
 */
const nonExtractActionLogEntryArbitrary = buildLogEntryArbitrary(
  nonExtractActionTypeArbitrary,
  actionResultArbitrary
);

/**
 * Log entries of any action kind whose result is always 'failure'.
 */
const failedActionLogEntryArbitrary = buildLogEntryArbitrary(
  actionTypeArbitrary,
  fc.constant('failure' as const)
);

/**
 * Shared shape of the three log-entry arbitraries above; only the `action`
 * and `result` generators differ between them. Declared as a hoisted function
 * so it can be used before its definition. Key order is kept as
 * timestamp, sessionId, action, intentId, value, result.
 */
function buildLogEntryArbitrary<A, R>(action: fc.Arbitrary<A>, result: fc.Arbitrary<R>) {
  return fc.record({
    timestamp: fc.integer({ min: 1000000000000, max: 2000000000000 }),
    sessionId: sessionIdArbitrary,
    action,
    intentId: intentIdArbitrary,
    // Optional free-text value: either undefined or a string of 0-100 characters.
    value: fc.option(fc.string({ minLength: 0, maxLength: 100 }), { nil: undefined }),
    result,
  });
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Generate arbitrary claimed results with data.
|
|
119
|
+
*/
|
|
120
|
+
const claimedResultWithDataArbitrary = fc.record({
  // Non-empty payload: 1-5 short strings plus a positive total.
  data: fc.record({
    items: fc.array(fc.string({ minLength: 1, maxLength: 20 }), { minLength: 1, maxLength: 5 }),
    total: fc.integer({ min: 1, max: 100 }),
  }),
  metadata: fc.record({
    // noNaN keeps the claimed confidence a real number in [0, 1]; without it
    // fc.float may still emit NaN even when min/max are given.
    confidence: fc.float({ min: 0, max: 1, noNaN: true }),
    executionTimeMs: fc.integer({ min: 0, max: 10000 }),
    actionsPerformed: fc.integer({ min: 0, max: 100 }),
  }),
});
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Generate arbitrary successful action log entries with extract.
|
|
134
|
+
*/
|
|
135
|
+
const successfulExtractActionArbitrary = fc.record({
  // Same timestamp window as the other log-entry arbitraries (1e12 to 2e12).
  timestamp: fc.integer({ min: 1e12, max: 2e12 }),
  sessionId: sessionIdArbitrary,
  // Action and result are pinned: always an 'extract' that succeeded.
  action: fc.constant('extract' as const),
  intentId: intentIdArbitrary,
  value: fc.option(
    fc.string({ minLength: 0, maxLength: 100 }),
    { nil: undefined }
  ),
  result: fc.constant('success' as const),
});
|
|
143
|
+
|
|
144
|
+
// =============================================================================
|
|
145
|
+
// Property 19: Hallucination Detection
|
|
146
|
+
// =============================================================================
|
|
147
|
+
|
|
148
|
+
describe('Property 19: Hallucination Detection', () => {
|
|
149
|
+
/**
|
|
150
|
+
* **Feature: omnibridge, Property 19: Hallucination Detection**
|
|
151
|
+
*
|
|
152
|
+
* *For any* agent result where the claimed outcome cannot be derived from
|
|
153
|
+
* the recorded action sequence, the verification SHALL flag
|
|
154
|
+
* `isHallucination: true`.
|
|
155
|
+
*
|
|
156
|
+
* **Validates: Requirements 8.5**
|
|
157
|
+
*/
|
|
158
|
+
test('Property 19: claimed data without extract actions is flagged as hallucination', async () => {
  // Non-empty logs that contain no 'extract' action at all.
  const logsWithoutExtract = fc.array(nonExtractActionLogEntryArbitrary, {
    minLength: 1,
    maxLength: 10,
  });

  const property = fc.asyncProperty(
    logsWithoutExtract,
    claimedResultWithDataArbitrary,
    async (actionLog, claimedResult) => {
      const { isHallucination, reasons } = verificationEngine.detectHallucination(
        claimedResult,
        actionLog
      );

      // Data was claimed but nothing was extracted, so this must be flagged.
      expect(isHallucination).toBe(true);
      expect(reasons.length).toBeGreaterThan(0);
      expect(reasons).toContain('Data claimed but no extract actions performed');
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Claimed data with extract actions is NOT flagged as hallucination.
|
|
188
|
+
*/
|
|
189
|
+
test('claimed data with extract actions is NOT flagged as hallucination', async () => {
  const property = fc.asyncProperty(
    successfulExtractActionArbitrary,
    fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 5 }),
    claimedResultWithDataArbitrary,
    async (extractAction, otherActions, claimedResult) => {
      // The guaranteed successful extract goes first, then the arbitrary rest.
      const { reasons } = verificationEngine.detectHallucination(claimedResult, [
        extractAction,
        ...otherActions,
      ]);

      // Only the "missing extract" reason is ruled out; other reasons may appear.
      expect(reasons).not.toContain('Data claimed but no extract actions performed');
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
218
|
+
|
|
219
|
+
/**
|
|
220
|
+
* All failed actions with success claim is flagged as hallucination.
|
|
221
|
+
*/
|
|
222
|
+
test('all failed actions with success claim is flagged as hallucination', async () => {
  // Non-empty logs in which every entry has result 'failure'.
  const allFailedLogs = fc.array(failedActionLogEntryArbitrary, {
    minLength: 1,
    maxLength: 10,
  });

  const property = fc.asyncProperty(
    allFailedLogs,
    claimedResultWithDataArbitrary,
    async (actionLog, claimedResult) => {
      const { isHallucination, reasons } = verificationEngine.detectHallucination(
        claimedResult,
        actionLog
      );

      // A result was claimed although no action succeeded.
      expect(isHallucination).toBe(true);
      expect(reasons).toContain('Success claimed but all actions failed');
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* No actions with claimed result is flagged as hallucination.
|
|
251
|
+
*/
|
|
252
|
+
test('no actions with claimed result is flagged as hallucination', async () => {
  const property = fc.asyncProperty(
    claimedResultWithDataArbitrary,
    async (claimedResult) => {
      // An empty action log cannot back any claimed result.
      const { isHallucination, reasons } = verificationEngine.detectHallucination(
        claimedResult,
        []
      );

      expect(isHallucination).toBe(true);
      expect(reasons).toContain('Result claimed with no actions performed');
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
271
|
+
|
|
272
|
+
/**
|
|
273
|
+
* Null claimed result is never flagged as hallucination.
|
|
274
|
+
*/
|
|
275
|
+
test('null claimed result is never flagged as hallucination', async () => {
  // Any log, including the empty one.
  const anyLog = fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 });

  const property = fc.asyncProperty(anyLog, async (actionLog) => {
    const { isHallucination, reasons } = verificationEngine.detectHallucination(
      null,
      actionLog
    );

    // Nothing was claimed, so nothing can be flagged and no reason is reported.
    expect(isHallucination).toBe(false);
    expect(reasons.length).toBe(0);
  });

  await fc.assert(property, { numRuns: 100 });
});
|
|
295
|
+
|
|
296
|
+
/**
|
|
297
|
+
* Undefined claimed result is never flagged as hallucination.
|
|
298
|
+
*/
|
|
299
|
+
test('undefined claimed result is never flagged as hallucination', async () => {
  // Any log, including the empty one.
  const anyLog = fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 });

  const property = fc.asyncProperty(anyLog, async (actionLog) => {
    const { isHallucination, reasons } = verificationEngine.detectHallucination(
      undefined,
      actionLog
    );

    // Same expectation as for a null claim: not flagged, no reasons.
    expect(isHallucination).toBe(false);
    expect(reasons.length).toBe(0);
  });

  await fc.assert(property, { numRuns: 100 });
});
|
|
319
|
+
|
|
320
|
+
/**
|
|
321
|
+
* Action summary is correctly calculated.
|
|
322
|
+
*/
|
|
323
|
+
test('action summary is correctly calculated', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 20 }),
    async (actionLog) => {
      const { actionSummary } = verificationEngine.detectHallucination(null, actionLog);

      // Recount the log independently in a single pass.
      let successCount = 0;
      let failureCount = 0;
      let extractCount = 0;
      for (const entry of actionLog) {
        if (entry.result === 'success') {
          successCount += 1;
        }
        if (entry.result === 'failure') {
          failureCount += 1;
        }
        if (entry.action === 'extract') {
          extractCount += 1;
        }
      }

      expect(actionSummary.totalActions).toBe(actionLog.length);
      expect(actionSummary.successfulActions).toBe(successCount);
      expect(actionSummary.failedActions).toBe(failureCount);
      expect(actionSummary.extractActions).toBe(extractCount);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
357
|
+
|
|
358
|
+
/**
|
|
359
|
+
* Hallucination confidence is between 0 and 1.
|
|
360
|
+
*/
|
|
361
|
+
test('hallucination confidence is between 0 and 1', async () => {
  // The claim is null, undefined, or a generated result carrying data.
  const anyClaim = fc.oneof(
    fc.constant(null),
    fc.constant(undefined),
    claimedResultWithDataArbitrary
  );

  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
    anyClaim,
    async (actionLog, claimedResult) => {
      const { confidence } = verificationEngine.detectHallucination(claimedResult, actionLog);

      expect(confidence).toBeGreaterThanOrEqual(0);
      expect(confidence).toBeLessThanOrEqual(1);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
388
|
+
});
|
|
389
|
+
|
|
390
|
+
// =============================================================================
|
|
391
|
+
// Diff Analysis Tests (Requirement 8.4)
|
|
392
|
+
// =============================================================================
|
|
393
|
+
|
|
394
|
+
describe('Diff Analysis for Conflicts', () => {
|
|
395
|
+
/**
|
|
396
|
+
* Divergence point is correctly identified.
|
|
397
|
+
*/
|
|
398
|
+
test('divergence point is correctly identified', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 2, maxLength: 10 }),
    fc.integer({ min: 0, max: 9 }),
    async (commonActions, divergeIndexRaw) => {
      // Fold the raw index into [0, commonActions.length), so the entry at
      // divergeIndex always exists.
      const divergeIndex = divergeIndexRaw % commonActions.length;

      // Both logs share the entries before divergeIndex and then disagree
      // only on the result of the entry at divergeIndex.
      const sharedPrefix = commonActions.slice(0, divergeIndex);
      const pivot = commonActions[divergeIndex];
      const logA: ActionLogEntry[] = [...sharedPrefix, { ...pivot, result: 'success' }];
      const logB: ActionLogEntry[] = [...sharedPrefix, { ...pivot, result: 'failure' }];

      const { divergencePoint } = verificationEngine.analyzeDiff(logA, logB);

      expect(divergencePoint).toBe(divergeIndex);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* Identical logs have divergence at the end.
|
|
443
|
+
*/
|
|
444
|
+
test('identical logs have divergence at the end', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
    async (actions) => {
      // The very same log is passed as both agent A and agent B.
      const { divergencePoint, recommendation } = verificationEngine.analyzeDiff(
        actions,
        actions
      );

      // With nothing to disagree on, the divergence index is the log length.
      expect(divergencePoint).toBe(actions.length);

      const anySuccess = actions.some((entry) => entry.result === 'success');
      const anyExtract = actions.some((entry) => entry.action === 'extract');

      if (!anySuccess) {
        // Only failures on both sides.
        expect(recommendation).toBe('both_invalid');
      } else if (anyExtract) {
        // At least one success and at least one extract on both sides.
        expect(recommendation).toBe('tie');
      }
      // Successes without any extract: no recommendation is asserted here.
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
476
|
+
|
|
477
|
+
/**
|
|
478
|
+
* Agent with higher success rate is recommended.
|
|
479
|
+
*/
|
|
480
|
+
test('agent with higher success rate is recommended', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 3, maxLength: 10 }),
    async (baseActions) => {
      // Agent A: every entry rewritten as a successful 'extract'.
      const logA: ActionLogEntry[] = baseActions.map((entry) => {
        return { ...entry, result: 'success' as const, action: 'extract' as const };
      });

      // Agent B: the original actions, each forced to 'failure'.
      const logB: ActionLogEntry[] = baseActions.map((entry) => {
        return { ...entry, result: 'failure' as const };
      });

      const { recommendation, agentAMetrics, agentBMetrics } =
        verificationEngine.analyzeDiff(logA, logB);

      expect(recommendation).toBe('agent_a');
      expect(agentAMetrics.successRate).toBeGreaterThan(agentBMetrics.successRate);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
515
|
+
|
|
516
|
+
/**
|
|
517
|
+
* Both invalid when neither has successful actions.
|
|
518
|
+
*/
|
|
519
|
+
test('both invalid when neither has successful actions', async () => {
  // Builds a fresh generator of non-empty, all-failure logs; called once per agent.
  const failedLog = () =>
    fc.array(failedActionLogEntryArbitrary, { minLength: 1, maxLength: 5 });

  const property = fc.asyncProperty(failedLog(), failedLog(), async (logA, logB) => {
    const { recommendation } = verificationEngine.analyzeDiff(logA, logB);

    expect(recommendation).toBe('both_invalid');
  });

  await fc.assert(property, { numRuns: 100 });
});
|
|
538
|
+
|
|
539
|
+
/**
|
|
540
|
+
* Recommendation confidence is between 0 and 1.
|
|
541
|
+
*/
|
|
542
|
+
test('recommendation confidence is between 0 and 1', async () => {
  // Builds a fresh generator of non-empty logs; the two agents get independent logs.
  const nonEmptyLog = () =>
    fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 });

  const property = fc.asyncProperty(nonEmptyLog(), nonEmptyLog(), async (logA, logB) => {
    const { recommendationConfidence } = verificationEngine.analyzeDiff(logA, logB);

    expect(recommendationConfidence).toBeGreaterThanOrEqual(0);
    expect(recommendationConfidence).toBeLessThanOrEqual(1);
  });

  await fc.assert(property, { numRuns: 100 });
});
|
|
562
|
+
});
|
|
563
|
+
|
|
564
|
+
// =============================================================================
|
|
565
|
+
// Confidence Scoring Tests (Requirement 8.6)
|
|
566
|
+
// =============================================================================
|
|
567
|
+
|
|
568
|
+
describe('Confidence Scoring', () => {
|
|
569
|
+
/**
|
|
570
|
+
* Confidence score is between 0 and 1.
|
|
571
|
+
*/
|
|
572
|
+
test('confidence score is between 0 and 1', async () => {
  // The claim is null, undefined, or a generated result carrying data.
  const anyClaim = fc.oneof(
    fc.constant(null),
    fc.constant(undefined),
    claimedResultWithDataArbitrary
  );

  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
    anyClaim,
    async (actionLog, claimedResult) => {
      const { score } = verificationEngine.calculateConfidence(claimedResult, actionLog);

      expect(score).toBeGreaterThanOrEqual(0);
      expect(score).toBeLessThanOrEqual(1);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
599
|
+
|
|
600
|
+
/**
|
|
601
|
+
* Higher success rate leads to higher confidence.
|
|
602
|
+
*/
|
|
603
|
+
test('higher success rate leads to higher confidence', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 5, maxLength: 10 }),
    async (baseActions) => {
      // Two copies of the same actions: one all-success, one all-failure.
      const successLog: ActionLogEntry[] = baseActions.map((entry) => {
        return { ...entry, result: 'success' as const };
      });
      const failureLog: ActionLogEntry[] = baseActions.map((entry) => {
        return { ...entry, result: 'failure' as const };
      });

      const { score: successScore } = verificationEngine.calculateConfidence(null, successLog);
      const { score: failureScore } = verificationEngine.calculateConfidence(null, failureLog);

      // The all-success copy must score strictly higher.
      expect(successScore).toBeGreaterThan(failureScore);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
641
|
+
|
|
642
|
+
/**
|
|
643
|
+
* Extract actions increase confidence.
|
|
644
|
+
*/
|
|
645
|
+
test('extract actions increase confidence', async () => {
  // 3-5 successful 'click' entries; no `value` field is generated for them.
  const successfulClicks = fc.array(
    fc.record({
      timestamp: fc.integer({ min: 1000000000000, max: 2000000000000 }),
      sessionId: sessionIdArbitrary,
      action: fc.constant('click' as const),
      intentId: intentIdArbitrary,
      result: fc.constant('success' as const),
    }),
    { minLength: 3, maxLength: 5 }
  );

  const property = fc.asyncProperty(successfulClicks, async (clickActions) => {
    // Log without extract
    const noExtractLog = clickActions;

    // Derive the extract timestamp from the generated clicks instead of the
    // wall clock, so a run depends only on the generated input (and is
    // therefore reproducible from its seed) and the extract is never
    // timestamped before the clicks it follows.
    const latestClickTimestamp = Math.max(...clickActions.map((entry) => entry.timestamp));

    // Log with extract: the same clicks plus one successful extract at the end
    const withExtractLog: ActionLogEntry[] = [
      ...clickActions,
      {
        timestamp: latestClickTimestamp + 1,
        sessionId: clickActions[0].sessionId,
        action: 'extract' as const,
        intentId: 'DISPLAY_ID:DATA',
        result: 'success' as const,
      },
    ];

    const noExtractResult = verificationEngine.calculateConfidence(null, noExtractLog);
    const withExtractResult = verificationEngine.calculateConfidence(null, withExtractLog);

    // With extract should have higher confidence
    expect(withExtractResult.score).toBeGreaterThan(noExtractResult.score);
  });

  await fc.assert(property, { numRuns: 100 });
});
|
|
695
|
+
|
|
696
|
+
/**
|
|
697
|
+
* Reliability threshold is respected.
|
|
698
|
+
*/
|
|
699
|
+
test('reliability threshold is respected', async () => {
  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
    async (actionLog) => {
      const { isReliable, score, reliabilityThreshold } =
        verificationEngine.calculateConfidence(null, actionLog);

      // isReliable must equal "score reaches the reported threshold".
      const meetsThreshold = score >= reliabilityThreshold;
      expect(isReliable).toBe(meetsThreshold);
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
720
|
+
|
|
721
|
+
/**
|
|
722
|
+
* Confidence factors are all present.
|
|
723
|
+
*/
|
|
724
|
+
test('confidence factors are all present', async () => {
  // Factor names expected in every confidence result.
  const requiredFactorNames = [
    'Action Success Rate',
    'Extract Actions Present',
    'Action-Result Alignment',
    'Action Completeness',
  ];

  const property = fc.asyncProperty(
    fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
    async (actionLog) => {
      const { factors } = verificationEngine.calculateConfidence(null, actionLog);

      const factorNames = factors.map((factor) => factor.name);
      for (const requiredName of requiredFactorNames) {
        expect(factorNames).toContain(requiredName);
      }

      // Each factor's score and weight must lie within [0, 1].
      for (const { score, weight } of factors) {
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
        expect(weight).toBeGreaterThanOrEqual(0);
        expect(weight).toBeLessThanOrEqual(1);
      }
    }
  );

  await fc.assert(property, { numRuns: 100 });
});
|
|
755
|
+
|
|
756
|
+
/**
|
|
757
|
+
* Empty action log has low confidence.
|
|
758
|
+
*/
|
|
759
|
+
test('empty action log has low confidence', async () => {
  // No claim and no actions: the score must fall below the reliability threshold.
  const { score, reliabilityThreshold, isReliable } = verificationEngine.calculateConfidence(
    null,
    []
  );

  expect(score).toBeLessThan(reliabilityThreshold);
  expect(isReliable).toBe(false);
});
|
|
769
|
+
});
|