outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,783 @@
1
+ /**
2
+ * Verification Engine
3
+ *
4
+ * Provides hallucination detection, diff analysis for conflicts,
5
+ * and confidence scoring for agent verification.
6
+ *
7
+ * Requirements: 8.4, 8.5, 8.6
8
+ */
9
+
10
+ import type {
11
+ ActionLogEntry,
12
+ VerificationProof,
13
+ DiffAnalysis,
14
+ } from '../core/types.js';
15
+
16
+ // =============================================================================
17
+ // Types
18
+ // =============================================================================
19
+
/**
 * Outcome of checking a claimed result against the actions that were logged.
 */
export interface HallucinationDetectionResult {
  /** True when at least one inconsistency between claim and log was found. */
  isHallucination: boolean;
  /** How confident the detector is in its verdict, on a 0-1 scale. */
  confidence: number;
  /** Human-readable explanations, one per inconsistency found. */
  reasons: string[];
  /** The result the agent claimed, passed back unchanged for reference. */
  claimedResult: unknown;
  /** Aggregate counts describing the action log that was inspected. */
  actionSummary: ActionSummary;
}
35
+
/**
 * Aggregate counts describing the actions recorded during one session.
 */
export interface ActionSummary {
  /** How many actions the log contains in total. */
  totalActions: number;
  /** How many of those actions succeeded. */
  successfulActions: number;
  /** How many of those actions did not succeed. */
  failedActions: number;
  /** Count of `extract` actions. */
  extractActions: number;
  /** Count of `click` actions. */
  clickActions: number;
  /** Count of `type` actions. */
  typeActions: number;
  /** Count of `navigate` actions. */
  navigateActions: number;
  /** De-duplicated intent IDs touched by any action in the log. */
  uniqueIntentIds: string[];
}
57
+
/**
 * Outcome of scoring how trustworthy a claimed result is.
 */
export interface ConfidenceScoreResult {
  /** Final confidence on a 0-1 scale. */
  score: number;
  /** The individual weighted factors the score was built from. */
  factors: ConfidenceFactor[];
  /** True when `score` reaches `reliabilityThreshold`. */
  isReliable: boolean;
  /** The threshold `score` was compared against to decide `isReliable`. */
  reliabilityThreshold: number;
}
71
+
/**
 * One weighted ingredient of a confidence score.
 */
export interface ConfidenceFactor {
  /** Display name identifying the factor. */
  name: string;
  /** Relative importance of the factor (0-1). */
  weight: number;
  /** The factor's own score (0-1) before weighting. */
  score: number;
  /** Short explanation of how the factor's score came about. */
  description: string;
}
85
+
/**
 * Tunable settings for the Verification Engine.
 *
 * The four `*Weight` values are relative weights: the engine divides the
 * weighted sum of factor scores by the sum of the weights.
 */
export interface VerificationEngineConfig {
  /** Lowest confidence score that still counts as a reliable result. */
  reliabilityThreshold: number;
  /** Weight given to the share of actions that succeeded. */
  actionSuccessWeight: number;
  /** Weight given to whether any extract action was performed. */
  extractPresenceWeight: number;
  /** Weight given to how well the claimed result matches the actions. */
  alignmentWeight: number;
  /** Weight given to how complete the action sequence looks. */
  completenessWeight: number;
}
101
+
/**
 * A diff analysis enriched with the reasoning behind its recommendation
 * and the per-agent metrics the recommendation was based on.
 */
export interface ExtendedDiffAnalysis extends DiffAnalysis {
  /** Explanation of why the recommendation was made. */
  recommendationReason: string;
  /** How confident the engine is in the recommendation (0-1). */
  recommendationConfidence: number;
  /** Performance metrics computed from agent A's log. */
  agentAMetrics: AgentMetrics;
  /** Performance metrics computed from agent B's log. */
  agentBMetrics: AgentMetrics;
}
115
+
/**
 * Performance figures derived from a single agent's action log.
 */
export interface AgentMetrics {
  /** Number of actions in the log. */
  totalActions: number;
  /** Number of actions that succeeded. */
  successfulActions: number;
  /** Number of actions that failed. */
  failedActions: number;
  /** Fraction of actions that succeeded (0-1). */
  successRate: number;
  /** Number of `extract` actions in the log. */
  extractActions: number;
  /** Whether the agent is considered to have completed its task. */
  completedTask: boolean;
}
133
+
134
+ // =============================================================================
135
+ // Default Configuration
136
+ // =============================================================================
137
+
/**
 * Settings used for every option the caller does not override.
 * The four weights add up to 1.
 */
const DEFAULT_CONFIG: VerificationEngineConfig = {
  // Scores at or above this value are reported as reliable.
  reliabilityThreshold: 0.7,
  // Share of actions that succeeded.
  actionSuccessWeight: 0.3,
  // Presence of at least one extract action.
  extractPresenceWeight: 0.25,
  // Agreement between the claimed result and the actions.
  alignmentWeight: 0.3,
  // Completeness of the action sequence.
  completenessWeight: 0.15,
};
145
+
146
+ // =============================================================================
147
+ // Verification Engine Implementation
148
+ // =============================================================================
149
+
/**
 * Verification Engine for detecting hallucinations and analyzing agent results.
 *
 * Three public analyses are offered, each driven by an agent's action log:
 * - `detectHallucination` flags claimed results that the log cannot support,
 * - `analyzeDiff` compares two competing logs and recommends a winner,
 * - `calculateConfidence` produces a weighted 0-1 reliability score.
 */
export class VerificationEngine {
  /**
   * Top-level result keys that are treated as bookkeeping rather than
   * payload when deciding whether a result object carries content.
   */
  private static readonly META_KEYS: readonly string[] = [
    'metadata',
    'verificationHash',
    'confidence',
    'error',
  ];

  /** Number of data items a single extract action may account for (batch allowance). */
  private static readonly MAX_ITEMS_PER_EXTRACT = 10;

  private config: VerificationEngineConfig;

  /**
   * @param config Overrides applied on top of the default configuration.
   *   Entries whose value is `undefined` are ignored, so a default is never
   *   replaced by a missing value.
   */
  constructor(config: Partial<VerificationEngineConfig> = {}) {
    this.config = VerificationEngine.mergeConfig(DEFAULT_CONFIG, config);
  }

  /**
   * Return a copy of `base` with every defined entry of `overrides` applied.
   *
   * A plain object spread would also copy explicit `undefined` values (which
   * `Partial<...>` permits) and turn later score arithmetic into `NaN`.
   */
  private static mergeConfig(
    base: VerificationEngineConfig,
    overrides: Partial<VerificationEngineConfig>
  ): VerificationEngineConfig {
    const merged = { ...base };
    for (const key of Object.keys(overrides) as (keyof VerificationEngineConfig)[]) {
      const value = overrides[key];
      if (value !== undefined) {
        merged[key] = value;
      }
    }
    return merged;
  }

  // ===========================================================================
  // Hallucination Detection (Requirement 8.5)
  // ===========================================================================

  /**
   * Detect if a claimed result is a hallucination.
   *
   * A reason is recorded for each of the following that holds:
   * 1. A non-null result is claimed although the log is empty
   * 2. Success is claimed although every logged action failed
   * 3. Data is claimed although no extract action was logged
   * 4. Data is claimed although the log touched no intent IDs at all
   * 5. The claimed data holds more items than the extract actions could yield
   *
   * Requirements: 8.5
   *
   * @param claimedResult The result reported by the agent.
   * @param actionLog The actions the agent actually performed.
   * @returns The verdict (`isHallucination` is true when any reason was
   *   recorded), the reasons, a confidence value and the action summary.
   */
  detectHallucination(
    claimedResult: unknown,
    actionLog: ActionLogEntry[]
  ): HallucinationDetectionResult {
    const actionSummary = this.summarizeActions(actionLog);
    const claimsData = this.hasDataClaim(claimedResult);
    const reasons: string[] = [];

    // Check 1: a result without any action behind it
    if (actionLog.length === 0 && claimedResult !== null && claimedResult !== undefined) {
      reasons.push('Result claimed with no actions performed');
    }

    // Check 2: success reported while nothing succeeded
    if (
      actionSummary.totalActions > 0 &&
      actionSummary.successfulActions === 0 &&
      this.isSuccessfulResult(claimedResult)
    ) {
      reasons.push('Success claimed but all actions failed');
    }

    // Check 3: data without an extract action
    if (claimsData && actionSummary.extractActions === 0) {
      reasons.push('Data claimed but no extract actions performed');
    }

    // Check 4: data without any intent ID it could have come from
    if (
      claimsData &&
      this.checkDataFieldMismatch(claimedResult, actionSummary.uniqueIntentIds)
    ) {
      reasons.push('Claimed data fields do not correspond to extracted elements');
    }

    // Check 5: more data than the extract actions could plausibly produce
    if (this.isResultTooComplex(claimedResult, actionSummary)) {
      reasons.push('Result complexity exceeds what actions could produce');
    }

    return {
      isHallucination: reasons.length > 0,
      confidence: this.calculateHallucinationConfidence(reasons, actionSummary),
      reasons,
      claimedResult,
      actionSummary,
    };
  }

  /**
   * Summarize actions from an action log.
   *
   * Every entry whose result is not `'success'` counts as failed. Intent IDs
   * are collected from all entries, not only from extract actions.
   */
  private summarizeActions(actionLog: ActionLogEntry[]): ActionSummary {
    const intentIds = new Set<string>();
    let successfulActions = 0;
    let extractActions = 0;
    let clickActions = 0;
    let typeActions = 0;
    let navigateActions = 0;

    for (const entry of actionLog) {
      intentIds.add(entry.intentId);

      if (entry.result === 'success') {
        successfulActions++;
      }

      switch (entry.action) {
        case 'extract':
          extractActions++;
          break;
        case 'click':
          clickActions++;
          break;
        case 'type':
          typeActions++;
          break;
        case 'navigate':
          navigateActions++;
          break;
      }
    }

    return {
      totalActions: actionLog.length,
      successfulActions,
      failedActions: actionLog.length - successfulActions,
      extractActions,
      clickActions,
      typeActions,
      navigateActions,
      uniqueIntentIds: Array.from(intentIds),
    };
  }

  /**
   * Check if a result indicates success.
   *
   * Only object results can indicate success. `success: true` is an explicit
   * claim; `success: false` is an explicit denial and wins over any payload,
   * so an agent that honestly reports a failure is not treated as claiming
   * success. Without an explicit boolean flag, non-empty `data` or any
   * non-bookkeeping key is taken as an implicit success claim.
   */
  private isSuccessfulResult(result: unknown): boolean {
    if (result === null || typeof result !== 'object') {
      return false;
    }

    const obj = result as Record<string, unknown>;

    if (obj.success === true) {
      return true;
    }
    if (obj.success === false) {
      return false;
    }

    // Non-empty data implies success
    const data = obj.data;
    if (data !== null && data !== undefined) {
      if (Array.isArray(data) && data.length > 0) {
        return true;
      }
      if (typeof data === 'object' && Object.keys(data as object).length > 0) {
        return true;
      }
    }

    // Any remaining non-bookkeeping key counts as content
    return Object.keys(obj).some((key) => !VerificationEngine.META_KEYS.includes(key));
  }

  /**
   * Check if a result claims to have data.
   *
   * Only object results can claim data. Non-empty `data` is a claim, and so
   * is any own key other than the bookkeeping keys and `success`.
   * NOTE: an own `data` key therefore counts as a claim even when its value
   * is empty, because the key itself is not a bookkeeping key.
   */
  private hasDataClaim(result: unknown): boolean {
    if (result === null || typeof result !== 'object') {
      return false;
    }

    const obj = result as Record<string, unknown>;

    if ('data' in obj) {
      const data = obj.data;
      if (Array.isArray(data) && data.length > 0) {
        return true;
      }
      if (typeof data === 'object' && data !== null && Object.keys(data as object).length > 0) {
        return true;
      }
    }

    return Object.keys(obj).some(
      (key) => key !== 'success' && !VerificationEngine.META_KEYS.includes(key)
    );
  }

  /**
   * Check if claimed data fields mismatch with extracted intent IDs.
   *
   * Currently a coarse check: a mismatch is reported only when no intent IDs
   * are available at all. Matching individual fields against intent IDs would
   * need schema analysis, so the result itself is not inspected.
   */
  private checkDataFieldMismatch(
    _result: unknown,
    extractedIntentIds: string[]
  ): boolean {
    return extractedIntentIds.length === 0;
  }

  /**
   * Check if result complexity exceeds what actions could produce.
   *
   * Counts the entries of `data` (array length or number of object keys) and
   * compares them with a budget of `MAX_ITEMS_PER_EXTRACT` items per extract
   * action, with a minimum budget of one item.
   */
  private isResultTooComplex(result: unknown, summary: ActionSummary): boolean {
    if (result === null || typeof result !== 'object') {
      return false;
    }

    const data = (result as Record<string, unknown>).data;

    let dataItemCount = 0;
    if (Array.isArray(data)) {
      dataItemCount = data.length;
    } else if (typeof data === 'object' && data !== null) {
      dataItemCount = Object.keys(data).length;
    }

    const maxReasonableItems = Math.max(
      summary.extractActions * VerificationEngine.MAX_ITEMS_PER_EXTRACT,
      1
    );

    return dataItemCount > maxReasonableItems;
  }

  /**
   * Calculate confidence in hallucination detection.
   *
   * Returns 0 when no reason was recorded. Otherwise each reason contributes
   * 0.3 (capped at 0.9), raised to at least 0.95 for an empty log and to at
   * least 0.85 when every logged action failed.
   */
  private calculateHallucinationConfidence(
    reasons: string[],
    summary: ActionSummary
  ): number {
    if (reasons.length === 0) {
      return 0;
    }

    let confidence = Math.min(reasons.length * 0.3, 0.9);

    if (summary.totalActions === 0) {
      confidence = Math.max(confidence, 0.95);
    } else if (summary.successfulActions === 0) {
      confidence = Math.max(confidence, 0.85);
    }

    return Math.min(confidence, 1);
  }

  // ===========================================================================
  // Diff Analysis for Conflicts (Requirement 8.4)
  // ===========================================================================

  /**
   * Compare action logs of competing agents and recommend a winner.
   *
   * Requirements: 8.4
   *
   * @param agentALog Actions performed by agent A.
   * @param agentBLog Actions performed by agent B.
   * @returns The index at which the logs diverge, each agent's remaining
   *   path from that index, per-agent metrics (computed over the whole log)
   *   and the recommendation with its reason and confidence.
   */
  analyzeDiff(
    agentALog: ActionLogEntry[],
    agentBLog: ActionLogEntry[]
  ): ExtendedDiffAnalysis {
    const divergencePoint = this.findDivergencePoint(agentALog, agentBLog);
    const agentAMetrics = this.calculateAgentMetrics(agentALog);
    const agentBMetrics = this.calculateAgentMetrics(agentBLog);
    const verdict = this.determineWinner(agentAMetrics, agentBMetrics, divergencePoint);

    return {
      divergencePoint,
      agentAPath: agentALog.slice(divergencePoint),
      agentBPath: agentBLog.slice(divergencePoint),
      recommendation: verdict.recommendation,
      recommendationReason: verdict.reason,
      recommendationConfidence: verdict.confidence,
      agentAMetrics,
      agentBMetrics,
    };
  }

  /**
   * Find the index where two action logs diverge.
   *
   * When one log is a prefix of the other, or both are identical, the length
   * of the shorter log is returned.
   */
  private findDivergencePoint(
    logA: ActionLogEntry[],
    logB: ActionLogEntry[]
  ): number {
    const sharedLength = Math.min(logA.length, logB.length);

    for (let i = 0; i < sharedLength; i++) {
      if (!this.actionsMatch(logA[i], logB[i])) {
        return i;
      }
    }

    return sharedLength;
  }

  /**
   * Check if two action entries match (ignoring timestamps).
   *
   * Compares action, intent ID, value and result with strict equality.
   */
  private actionsMatch(a: ActionLogEntry, b: ActionLogEntry): boolean {
    return (
      a.action === b.action &&
      a.intentId === b.intentId &&
      a.value === b.value &&
      a.result === b.result
    );
  }

  /**
   * Calculate metrics for an agent's action log.
   *
   * The task counts as completed only when at least one extract action
   * itself succeeded. A failed extract next to an unrelated successful
   * action does not qualify.
   */
  private calculateAgentMetrics(log: ActionLogEntry[]): AgentMetrics {
    let successfulActions = 0;
    let failedActions = 0;
    let extractActions = 0;
    let successfulExtracts = 0;

    for (const entry of log) {
      const succeeded = entry.result === 'success';
      if (succeeded) {
        successfulActions++;
      } else if (entry.result === 'failure') {
        failedActions++;
      }

      if (entry.action === 'extract') {
        extractActions++;
        if (succeeded) {
          successfulExtracts++;
        }
      }
    }

    const totalActions = log.length;

    return {
      totalActions,
      successfulActions,
      failedActions,
      successRate: totalActions > 0 ? successfulActions / totalActions : 0,
      extractActions,
      completedTask: successfulExtracts > 0,
    };
  }

  /**
   * Determine the winner based on metrics.
   *
   * Decision order: both invalid (no successful action on either side),
   * then task completion, then a success-rate gap above 0.1, then the
   * number of extract actions, and finally a tie.
   */
  private determineWinner(
    metricsA: AgentMetrics,
    metricsB: AgentMetrics,
    divergencePoint: number
  ): { recommendation: 'agent_a' | 'agent_b' | 'tie' | 'both_invalid'; reason: string; confidence: number } {
    const neitherCompleted = !metricsA.completedTask && !metricsB.completedTask;
    if (
      neitherCompleted &&
      metricsA.successfulActions === 0 &&
      metricsB.successfulActions === 0
    ) {
      return {
        recommendation: 'both_invalid',
        reason: 'Both agents failed to complete any actions successfully',
        confidence: 0.95,
      };
    }

    // Exactly one agent completed the task
    if (metricsA.completedTask && !metricsB.completedTask) {
      return {
        recommendation: 'agent_a',
        reason: 'Agent A completed the task while Agent B did not',
        confidence: 0.9,
      };
    }
    if (metricsB.completedTask && !metricsA.completedTask) {
      return {
        recommendation: 'agent_b',
        reason: 'Agent B completed the task while Agent A did not',
        confidence: 0.9,
      };
    }

    // Same completion status on both sides - compare success rates
    const asPercent = (rate: number): string => (rate * 100).toFixed(1);
    const rateDiff = Math.abs(metricsA.successRate - metricsB.successRate);

    if (rateDiff > 0.1) {
      const aLeads = metricsA.successRate > metricsB.successRate;
      const [leader, trailer] = aLeads ? [metricsA, metricsB] : [metricsB, metricsA];
      return {
        recommendation: aLeads ? 'agent_a' : 'agent_b',
        reason: `Agent ${aLeads ? 'A' : 'B'} has higher success rate (${asPercent(leader.successRate)}% vs ${asPercent(trailer.successRate)}%)`,
        confidence: 0.7 + rateDiff * 0.2,
      };
    }

    // Similar success rates - compare extract actions
    if (metricsA.extractActions !== metricsB.extractActions) {
      const aLeads = metricsA.extractActions > metricsB.extractActions;
      const [leader, trailer] = aLeads ? [metricsA, metricsB] : [metricsB, metricsA];
      return {
        recommendation: aLeads ? 'agent_a' : 'agent_b',
        reason: `Agent ${aLeads ? 'A' : 'B'} performed more data extractions (${leader.extractActions} vs ${trailer.extractActions})`,
        confidence: 0.65,
      };
    }

    return {
      recommendation: 'tie',
      reason: `Both agents performed similarly (diverged at action ${divergencePoint})`,
      confidence: 0.5,
    };
  }

  // ===========================================================================
  // Confidence Scoring (Requirement 8.6)
  // ===========================================================================

  /**
   * Calculate confidence score for a verification result.
   *
   * Four factors (action success rate, presence of extract actions,
   * action-result alignment, action completeness) are combined as a weighted
   * average using the configured weights. When a proof is supplied the
   * average is raised by 0.1 if the proof says the result matches the
   * actions and lowered by 0.2 otherwise, clamped to the 0-1 range.
   *
   * Requirements: 8.6
   *
   * @param claimedResult The result reported by the agent.
   * @param actionLog The actions the agent actually performed.
   * @param proof Optional verification proof used to adjust the score.
   * @returns The final score, its factors and the reliability verdict.
   */
  calculateConfidence(
    claimedResult: unknown,
    actionLog: ActionLogEntry[],
    proof?: VerificationProof
  ): ConfidenceScoreResult {
    const summary = this.summarizeActions(actionLog);
    const { totalActions, successfulActions, extractActions } = summary;

    const successRateScore = totalActions > 0 ? successfulActions / totalActions : 0;
    const alignmentScore = this.calculateAlignmentScore(claimedResult, summary);
    const completenessScore = this.calculateCompletenessScore(summary);

    const factors: ConfidenceFactor[] = [
      {
        name: 'Action Success Rate',
        weight: this.config.actionSuccessWeight,
        score: successRateScore,
        description: `${successfulActions}/${totalActions} actions succeeded`,
      },
      {
        name: 'Extract Actions Present',
        weight: this.config.extractPresenceWeight,
        score: extractActions > 0 ? 1 : 0,
        description: extractActions > 0
          ? `${extractActions} extract actions performed`
          : 'No extract actions performed',
      },
      {
        name: 'Action-Result Alignment',
        weight: this.config.alignmentWeight,
        score: alignmentScore,
        description: alignmentScore > 0.7
          ? 'Result aligns well with actions'
          : 'Result may not fully align with actions',
      },
      {
        name: 'Action Completeness',
        weight: this.config.completenessWeight,
        score: completenessScore,
        description: completenessScore > 0.7
          ? 'Action sequence appears complete'
          : 'Action sequence may be incomplete',
      },
    ];

    // Weighted average of the factor scores
    let totalWeight = 0;
    let weightedSum = 0;
    for (const factor of factors) {
      totalWeight += factor.weight;
      weightedSum += factor.score * factor.weight;
    }

    let finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;

    // A proof nudges the score up or down
    if (proof) {
      finalScore = proof.resultMatchesActions
        ? Math.min(finalScore + 0.1, 1)
        : Math.max(finalScore - 0.2, 0);
    }

    return {
      score: finalScore,
      factors,
      isReliable: finalScore >= this.config.reliabilityThreshold,
      reliabilityThreshold: this.config.reliabilityThreshold,
    };
  }

  /**
   * Calculate alignment score between result and actions.
   *
   * Returns 1 when nothing is claimed, 0 when a result is claimed without
   * actions or success is claimed while every action failed, 0.2 for data
   * claimed without extract actions, 0.9 for data backed by extract actions
   * and 0.6 otherwise.
   */
  private calculateAlignmentScore(result: unknown, summary: ActionSummary): number {
    // Nothing claimed, so nothing can be misaligned
    if (result === null || result === undefined) {
      return 1;
    }

    if (summary.totalActions === 0) {
      return 0;
    }

    if (summary.successfulActions === 0 && this.isSuccessfulResult(result)) {
      return 0;
    }

    if (this.hasDataClaim(result)) {
      return summary.extractActions === 0 ? 0.2 : 0.9;
    }

    return 0.6;
  }

  /**
   * Calculate completeness score for action sequence.
   *
   * An empty log scores 0. Otherwise the score starts at 0.5 and gains 0.15
   * for navigation, 0.15 for any click or type action and 0.2 for any
   * extract action, capped at 1.
   */
  private calculateCompletenessScore(summary: ActionSummary): number {
    if (summary.totalActions === 0) {
      return 0;
    }

    let score = 0.5;

    if (summary.navigateActions > 0) {
      score += 0.15;
    }

    if (summary.clickActions > 0 || summary.typeActions > 0) {
      score += 0.15;
    }

    if (summary.extractActions > 0) {
      score += 0.2;
    }

    return Math.min(score, 1);
  }

  // ===========================================================================
  // Utility Methods
  // ===========================================================================

  /**
   * Get the current configuration.
   *
   * @returns A shallow copy, so mutating it does not affect the engine.
   */
  getConfig(): VerificationEngineConfig {
    return { ...this.config };
  }

  /**
   * Update configuration.
   *
   * @param config Settings to change. Entries whose value is `undefined`
   *   leave the current setting untouched.
   */
  updateConfig(config: Partial<VerificationEngineConfig>): void {
    this.config = VerificationEngine.mergeConfig(this.config, config);
  }
}
771
+
772
+ // =============================================================================
773
+ // Factory Function
774
+ // =============================================================================
775
+
/**
 * Build a Verification Engine.
 *
 * @param config Optional overrides for the engine's settings; defaults to
 *   no overrides.
 * @returns A freshly constructed engine.
 */
export function createVerificationEngine(
  config: Partial<VerificationEngineConfig> = {}
): VerificationEngine {
  const engine = new VerificationEngine(config);
  return engine;
}