outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,769 @@
1
+ /**
2
+ * Verification Engine Property Tests
3
+ *
4
+ * Property-based tests for hallucination detection, diff analysis,
5
+ * and confidence scoring.
6
+ *
7
+ * Requirements: 8.4, 8.5, 8.6
8
+ */
9
+
10
+ import { describe, test, expect, beforeEach } from 'vitest';
11
+ import * as fc from 'fast-check';
12
+ import {
13
+ VerificationEngine,
14
+ createVerificationEngine,
15
+ } from '../verification/verification-engine.js';
16
+ import type { ActionLogEntry } from '../core/types.js';
17
+
18
+ // =============================================================================
19
+ // Test Setup
20
+ // =============================================================================
21
+
22
// Note: every test in this file reads the module-level `verificationEngine`.
// `beforeEach` replaces it with a newly constructed engine before each test,
// so no engine instance is carried over from one test to the next.

/**
 * Options handed to `createVerificationEngine` for every test.
 * The four factor weights add up to 1 (0.3 + 0.25 + 0.3 + 0.15).
 */
const ENGINE_OPTIONS = {
  reliabilityThreshold: 0.7,
  actionSuccessWeight: 0.3,
  extractPresenceWeight: 0.25,
  alignmentWeight: 0.3,
  completenessWeight: 0.15,
};

let verificationEngine: VerificationEngine;

beforeEach(() => {
  // Spread into a fresh object so each engine receives its own options literal.
  verificationEngine = createVerificationEngine({ ...ENGINE_OPTIONS });
});
36
+
37
+ // =============================================================================
38
+ // Arbitraries
39
+ // =============================================================================
40
+
41
/** Shape of a session ID: the prefix `session_` followed by 8-16 lowercase alphanumerics. */
const SESSION_ID_PATTERN = /^session_[a-z0-9]{8,16}$/;

/** Shape of an intent ID: one of four kind prefixes, `_ID:`, then 3-20 uppercase letters/underscores. */
const INTENT_ID_PATTERN = /^(ACTION|INPUT|DISPLAY|NAV)_ID:[A-Z_]{3,20}$/;

/** Every action kind the generators below may emit. */
type ActionType = 'click' | 'type' | 'navigate' | 'wait' | 'extract';

/** Action kinds other than `extract`. */
type NonExtractActionType = 'click' | 'type' | 'navigate' | 'wait';

/** Outcome recorded for a single action. */
type ActionResult = 'success' | 'failure';

/**
 * Arbitrary producing session IDs that match SESSION_ID_PATTERN.
 */
const sessionIdArbitrary = fc.stringMatching(SESSION_ID_PATTERN);

/**
 * Arbitrary producing intent IDs that match INTENT_ID_PATTERN.
 */
const intentIdArbitrary = fc.stringMatching(INTENT_ID_PATTERN);

/**
 * Arbitrary picking any action type, `extract` included.
 */
const actionTypeArbitrary = fc.constantFrom(
  'click',
  'type',
  'navigate',
  'wait',
  'extract'
) as fc.Arbitrary<ActionType>;

/**
 * Arbitrary picking any action type except `extract`.
 */
const nonExtractActionTypeArbitrary = fc.constantFrom(
  'click',
  'type',
  'navigate',
  'wait'
) as fc.Arbitrary<NonExtractActionType>;

/**
 * Arbitrary picking an action result.
 */
const actionResultArbitrary = fc.constantFrom(
  'success',
  'failure'
) as fc.Arbitrary<ActionResult>;
80
+
81
/**
 * Build an arbitrary for action log entries whose `action` and `result`
 * fields are drawn from the supplied arbitraries. The remaining fields
 * (timestamp, session ID, intent ID, optional value) are generated the same
 * way for every variant, so the record shape is defined in one place.
 *
 * @param action - arbitrary supplying the entry's `action` field
 * @param result - arbitrary supplying the entry's `result` field
 * @returns an arbitrary of action-log-entry records
 */
function buildActionLogEntryArbitrary<A, R>(
  action: fc.Arbitrary<A>,
  result: fc.Arbitrary<R>
) {
  return fc.record({
    timestamp: fc.integer({ min: 1000000000000, max: 2000000000000 }),
    sessionId: sessionIdArbitrary,
    action,
    intentId: intentIdArbitrary,
    // `nil: undefined` makes the absent case `undefined` rather than `null`.
    value: fc.option(fc.string({ minLength: 0, maxLength: 100 }), { nil: undefined }),
    result,
  });
}

/**
 * Generate arbitrary action log entries with timestamp.
 */
const actionLogEntryArbitrary = buildActionLogEntryArbitrary(
  actionTypeArbitrary,
  actionResultArbitrary
);

/**
 * Generate arbitrary action log entries without extract actions.
 */
const nonExtractActionLogEntryArbitrary = buildActionLogEntryArbitrary(
  nonExtractActionTypeArbitrary,
  actionResultArbitrary
);

/**
 * Generate arbitrary failed action log entries (any action type, result
 * always `failure`).
 */
const failedActionLogEntryArbitrary = buildActionLogEntryArbitrary(
  actionTypeArbitrary,
  fc.constant('failure' as const)
);

/**
 * Generate arbitrary claimed results with data: a non-empty `items` list, a
 * positive `total`, and a metadata record.
 */
const claimedResultWithDataArbitrary = fc.record({
  data: fc.record({
    items: fc.array(fc.string({ minLength: 1, maxLength: 20 }), { minLength: 1, maxLength: 5 }),
    total: fc.integer({ min: 1, max: 100 }),
  }),
  metadata: fc.record({
    // `noNaN` keeps the generated confidence inside [0, 1]; without it
    // fc.float may also emit NaN regardless of the min/max bounds.
    confidence: fc.float({ min: 0, max: 1, noNaN: true }),
    executionTimeMs: fc.integer({ min: 0, max: 10000 }),
    actionsPerformed: fc.integer({ min: 0, max: 100 }),
  }),
});

/**
 * Generate arbitrary successful action log entries with extract.
 */
const successfulExtractActionArbitrary = buildActionLogEntryArbitrary(
  fc.constant('extract' as const),
  fc.constant('success' as const)
);
143
+
144
+ // =============================================================================
145
+ // Property 19: Hallucination Detection
146
+ // =============================================================================
147
+
148
describe('Property 19: Hallucination Detection', () => {
  /**
   * **Feature: omnibridge, Property 19: Hallucination Detection**
   *
   * *For any* agent result whose claimed outcome cannot be derived from the
   * recorded action sequence, verification SHALL flag
   * `isHallucination: true`.
   *
   * **Validates: Requirements 8.5**
   */
  test('Property 19: claimed data without extract actions is flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(nonExtractActionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
        claimedResultWithDataArbitrary,
        async (log, claim) => {
          const { isHallucination, reasons } = verificationEngine.detectHallucination(
            claim,
            log
          );

          // The log contains no extract action, so the claimed data has no source.
          expect(isHallucination).toBe(true);
          expect(reasons.length).toBeGreaterThan(0);
          expect(reasons).toContain('Data claimed but no extract actions performed');

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * When at least one successful extract is in the log, the
   * "no extract actions" reason must not be reported.
   */
  test('claimed data with extract actions is NOT flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(
        successfulExtractActionArbitrary,
        fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 5 }),
        claimedResultWithDataArbitrary,
        async (extractEntry, remainingEntries, claim) => {
          // The guaranteed extract goes first, followed by the arbitrary rest.
          const log = [extractEntry, ...remainingEntries];

          const { reasons } = verificationEngine.detectHallucination(claim, log);

          expect(reasons).not.toContain('Data claimed but no extract actions performed');

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * A claim backed only by failed actions is a hallucination.
   */
  test('all failed actions with success claim is flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(failedActionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
        claimedResultWithDataArbitrary,
        async (log, claim) => {
          const { isHallucination, reasons } = verificationEngine.detectHallucination(
            claim,
            log
          );

          // Every generated entry has result 'failure'.
          expect(isHallucination).toBe(true);
          expect(reasons).toContain('Success claimed but all actions failed');

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * A claim accompanied by an empty action log is a hallucination.
   */
  test('no actions with claimed result is flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(claimedResultWithDataArbitrary, async (claim) => {
        const { isHallucination, reasons } = verificationEngine.detectHallucination(claim, []);

        expect(isHallucination).toBe(true);
        expect(reasons).toContain('Result claimed with no actions performed');

        return true;
      }),
      { numRuns: 100 }
    ));

  /**
   * With a `null` claim there is nothing to contradict, whatever the log holds.
   */
  test('null claimed result is never flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
        async (log) => {
          const { isHallucination, reasons } = verificationEngine.detectHallucination(null, log);

          expect(isHallucination).toBe(false);
          expect(reasons.length).toBe(0);

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * Same expectation as the `null` case, for an `undefined` claim.
   */
  test('undefined claimed result is never flagged as hallucination', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
        async (log) => {
          const { isHallucination, reasons } = verificationEngine.detectHallucination(
            undefined,
            log
          );

          expect(isHallucination).toBe(false);
          expect(reasons.length).toBe(0);

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * The reported action summary matches counts computed directly from the log.
   */
  test('action summary is correctly calculated', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 20 }),
        async (log) => {
          const { actionSummary } = verificationEngine.detectHallucination(null, log);

          // Reference counts derived independently of the engine.
          const successCount = log.filter((entry) => entry.result === 'success').length;
          const failureCount = log.filter((entry) => entry.result === 'failure').length;
          const extractCount = log.filter((entry) => entry.action === 'extract').length;

          expect(actionSummary.totalActions).toBe(log.length);
          expect(actionSummary.successfulActions).toBe(successCount);
          expect(actionSummary.failedActions).toBe(failureCount);
          expect(actionSummary.extractActions).toBe(extractCount);

          return true;
        }
      ),
      { numRuns: 100 }
    ));

  /**
   * The hallucination confidence stays within [0, 1] for any log and any
   * claim (null, undefined, or a data-bearing result).
   */
  test('hallucination confidence is between 0 and 1', () =>
    fc.assert(
      fc.asyncProperty(
        fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
        fc.oneof(
          fc.constant(null),
          fc.constant(undefined),
          claimedResultWithDataArbitrary
        ),
        async (log, claim) => {
          const { confidence } = verificationEngine.detectHallucination(claim, log);

          expect(confidence).toBeGreaterThanOrEqual(0);
          expect(confidence).toBeLessThanOrEqual(1);

          return true;
        }
      ),
      { numRuns: 100 }
    ));
});
389
+
390
+ // =============================================================================
391
+ // Diff Analysis Tests (Requirement 8.4)
392
+ // =============================================================================
393
+
394
describe('Diff Analysis for Conflicts', () => {
  /**
   * Two logs that share a prefix and then disagree on one entry's result
   * must report the index of that entry as the divergence point.
   */
  test(
    'divergence point is correctly identified',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 2, maxLength: 10 }),
          fc.integer({ min: 0, max: 9 }),
          async (commonActions, divergeIndexRaw) => {
            // Fold the raw index into [0, commonActions.length), so the entry
            // at divergeIndex always exists.
            const divergeIndex = divergeIndexRaw % commonActions.length;

            // Both logs start with the same entries (same object references).
            const sharedPrefix = commonActions.slice(0, divergeIndex);
            const pivot = commonActions[divergeIndex];

            // The logs differ only in the result recorded for the pivot entry.
            const logA: ActionLogEntry[] = [...sharedPrefix, { ...pivot, result: 'success' }];
            const logB: ActionLogEntry[] = [...sharedPrefix, { ...pivot, result: 'failure' }];

            const analysis = verificationEngine.analyzeDiff(logA, logB);

            expect(analysis.divergencePoint).toBe(divergeIndex);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Comparing a log with itself reports the divergence point at the log's
   * length, i.e. one past the last entry.
   */
  test(
    'identical logs have divergence at the end',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
          async (actions) => {
            const analysis = verificationEngine.analyzeDiff(actions, actions);

            expect(analysis.divergencePoint).toBe(actions.length);

            // The recommendation is only pinned down in two cases:
            //   - no successful action at all        -> 'both_invalid'
            //   - some success and some extract      -> 'tie'
            // Any other combination is left unchecked here.
            const hasSuccesses = actions.some((a) => a.result === 'success');
            const hasExtracts = actions.some((a) => a.action === 'extract');

            if (!hasSuccesses) {
              expect(analysis.recommendation).toBe('both_invalid');
            } else if (hasExtracts) {
              // hasSuccesses is already known to be true in this branch.
              expect(analysis.recommendation).toBe('tie');
            }

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * An all-success, all-extract log is recommended over an all-failure log
   * built from the same base entries.
   */
  test(
    'agent with higher success rate is recommended',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 3, maxLength: 10 }),
          async (baseActions) => {
            // Agent A: every entry rewritten as a successful extract.
            const logA: ActionLogEntry[] = baseActions.map((entry) => ({
              ...entry,
              result: 'success' as const,
              action: 'extract' as const,
            }));

            // Agent B: same entries, every result rewritten to failure.
            const logB: ActionLogEntry[] = baseActions.map((entry) => ({
              ...entry,
              result: 'failure' as const,
            }));

            const analysis = verificationEngine.analyzeDiff(logA, logB);

            expect(analysis.recommendation).toBe('agent_a');
            expect(analysis.agentAMetrics.successRate).toBeGreaterThan(
              analysis.agentBMetrics.successRate
            );

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Both invalid when neither log contains a successful action.
   */
  test(
    'both invalid when neither has successful actions',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(failedActionLogEntryArbitrary, { minLength: 1, maxLength: 5 }),
          fc.array(failedActionLogEntryArbitrary, { minLength: 1, maxLength: 5 }),
          async (logA, logB) => {
            const analysis = verificationEngine.analyzeDiff(logA, logB);

            expect(analysis.recommendation).toBe('both_invalid');

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Recommendation confidence stays within [0, 1] for any pair of logs.
   */
  test(
    'recommendation confidence is between 0 and 1',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
          fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
          async (logA, logB) => {
            const analysis = verificationEngine.analyzeDiff(logA, logB);

            expect(analysis.recommendationConfidence).toBeGreaterThanOrEqual(0);
            expect(analysis.recommendationConfidence).toBeLessThanOrEqual(1);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );
});
563
+
564
+ // =============================================================================
565
+ // Confidence Scoring Tests (Requirement 8.6)
566
+ // =============================================================================
567
+
568
describe('Confidence Scoring', () => {
  /**
   * The overall confidence score stays within [0, 1] for any log and any
   * claim (null, undefined, or a data-bearing result).
   */
  test(
    'confidence score is between 0 and 1',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 0, maxLength: 10 }),
          fc.oneof(
            fc.constant(null),
            fc.constant(undefined),
            claimedResultWithDataArbitrary
          ),
          async (actionLog, claimedResult) => {
            const result = verificationEngine.calculateConfidence(
              claimedResult,
              actionLog
            );

            expect(result.score).toBeGreaterThanOrEqual(0);
            expect(result.score).toBeLessThanOrEqual(1);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * The same entries score strictly higher when every result is a success
   * than when every result is a failure.
   */
  test(
    'higher success rate leads to higher confidence',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 5, maxLength: 10 }),
          async (baseActions) => {
            // Same entries, every result rewritten to success.
            const successLog: ActionLogEntry[] = baseActions.map((a) => ({
              ...a,
              result: 'success' as const,
            }));

            // Same entries, every result rewritten to failure.
            const failureLog: ActionLogEntry[] = baseActions.map((a) => ({
              ...a,
              result: 'failure' as const,
            }));

            const successResult = verificationEngine.calculateConfidence(
              null,
              successLog
            );
            const failureResult = verificationEngine.calculateConfidence(
              null,
              failureLog
            );

            expect(successResult.score).toBeGreaterThan(failureResult.score);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Appending one successful extract to a log of successful clicks yields a
   * strictly higher confidence score.
   */
  test(
    'extract actions increase confidence',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(
            fc.record({
              timestamp: fc.integer({ min: 1000000000000, max: 2000000000000 }),
              sessionId: sessionIdArbitrary,
              action: fc.constant('click' as const),
              intentId: intentIdArbitrary,
              result: fc.constant('success' as const),
            }),
            { minLength: 3, maxLength: 5 }
          ),
          async (clickActions) => {
            // Baseline: successful clicks only.
            const noExtractLog = clickActions;

            // Timestamp for the appended extract is derived from the generated
            // clicks (one tick after the latest) instead of the wall clock, so
            // a failing case replays identically from fast-check's seed.
            const latestClickTimestamp = Math.max(
              ...clickActions.map((click) => click.timestamp)
            );

            const withExtractLog: ActionLogEntry[] = [
              ...clickActions,
              {
                timestamp: latestClickTimestamp + 1,
                sessionId: clickActions[0].sessionId,
                action: 'extract' as const,
                intentId: 'DISPLAY_ID:DATA',
                result: 'success' as const,
              },
            ];

            const noExtractResult = verificationEngine.calculateConfidence(
              null,
              noExtractLog
            );
            const withExtractResult = verificationEngine.calculateConfidence(
              null,
              withExtractLog
            );

            expect(withExtractResult.score).toBeGreaterThan(noExtractResult.score);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * `isReliable` is exactly the comparison of the score against the
   * reliability threshold reported in the same result.
   */
  test(
    'reliability threshold is respected',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
          async (actionLog) => {
            const result = verificationEngine.calculateConfidence(null, actionLog);

            expect(result.isReliable).toBe(
              result.score >= result.reliabilityThreshold
            );

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * The result lists all four named factors, each with a score and a weight
   * inside [0, 1].
   */
  test(
    'confidence factors are all present',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          fc.array(actionLogEntryArbitrary, { minLength: 1, maxLength: 10 }),
          async (actionLog) => {
            const result = verificationEngine.calculateConfidence(null, actionLog);

            const factorNames = result.factors.map((f) => f.name);
            expect(factorNames).toContain('Action Success Rate');
            expect(factorNames).toContain('Extract Actions Present');
            expect(factorNames).toContain('Action-Result Alignment');
            expect(factorNames).toContain('Action Completeness');

            for (const factor of result.factors) {
              expect(factor.score).toBeGreaterThanOrEqual(0);
              expect(factor.score).toBeLessThanOrEqual(1);
              expect(factor.weight).toBeGreaterThanOrEqual(0);
              expect(factor.weight).toBeLessThanOrEqual(1);
            }

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * An empty action log scores below the reliability threshold and is
   * reported as not reliable.
   */
  test('empty action log has low confidence', () => {
    const result = verificationEngine.calculateConfidence(null, []);

    expect(result.score).toBeLessThan(result.reliabilityThreshold);
    expect(result.isReliable).toBe(false);
  });
});