outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,783 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verification Engine
|
|
3
|
+
*
|
|
4
|
+
* Provides hallucination detection, diff analysis for conflicts,
|
|
5
|
+
* and confidence scoring for agent verification.
|
|
6
|
+
*
|
|
7
|
+
* Requirements: 8.4, 8.5, 8.6
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type {
|
|
11
|
+
ActionLogEntry,
|
|
12
|
+
VerificationProof,
|
|
13
|
+
DiffAnalysis,
|
|
14
|
+
} from '../core/types.js';
|
|
15
|
+
|
|
16
|
+
// =============================================================================
|
|
17
|
+
// Types
|
|
18
|
+
// =============================================================================
|
|
19
|
+
|
|
20
|
+
/**
 * Outcome of checking an agent's claimed result against its action log.
 */
export interface HallucinationDetectionResult {
  /** `true` when at least one hallucination indicator was raised. */
  isHallucination: boolean;
  /** How confident the engine is in the detection, on a 0-1 scale. */
  confidence: number;
  /** Human-readable explanation for each indicator that was raised. */
  reasons: string[];
  /** The claimed result exactly as it was handed to the engine. */
  claimedResult: unknown;
  /** Aggregated counts describing the actions that were actually logged. */
  actionSummary: ActionSummary;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Aggregate counts describing the actions recorded in a session's log.
 */
export interface ActionSummary {
  /** How many log entries there were in total. */
  totalActions: number;
  /** Entries that ended successfully. */
  successfulActions: number;
  /** Entries that did not end successfully. */
  failedActions: number;
  /** Entries whose action was `extract`. */
  extractActions: number;
  /** Entries whose action was `click`. */
  clickActions: number;
  /** Entries whose action was `type`. */
  typeActions: number;
  /** Entries whose action was `navigate`. */
  navigateActions: number;
  /** De-duplicated intent IDs touched by any logged action. */
  uniqueIntentIds: string[];
}
|
|
57
|
+
|
|
58
|
+
/**
 * Outcome of scoring how trustworthy a claimed result is.
 */
export interface ConfidenceScoreResult {
  /** Final confidence score on a 0-1 scale. */
  score: number;
  /** The individual factors that were combined into the score. */
  factors: ConfidenceFactor[];
  /** Whether the score reached the reliability threshold. */
  isReliable: boolean;
  /** The threshold the score was compared against. */
  reliabilityThreshold: number;
}
|
|
71
|
+
|
|
72
|
+
/**
 * One weighted input to the overall confidence score.
 */
export interface ConfidenceFactor {
  /** Display name identifying the factor. */
  name: string;
  /** Relative weight of the factor (0-1). */
  weight: number;
  /** Score achieved for the factor (0-1). */
  score: number;
  /** Short explanation of how the score was derived. */
  description: string;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Tunable settings for the Verification Engine.
 */
export interface VerificationEngineConfig {
  /** Lowest confidence score that still counts as reliable. */
  reliabilityThreshold: number;
  /** Weight given to the action success rate when scoring confidence. */
  actionSuccessWeight: number;
  /** Weight given to the presence of extract actions when scoring confidence. */
  extractPresenceWeight: number;
  /** Weight given to action-result alignment when scoring confidence. */
  alignmentWeight: number;
  /** Weight given to action-sequence completeness when scoring confidence. */
  completenessWeight: number;
}
|
|
101
|
+
|
|
102
|
+
/**
 * A diff analysis enriched with the reasoning behind the winner recommendation.
 */
export interface ExtendedDiffAnalysis extends DiffAnalysis {
  /** Explanation of why the recommendation was made. */
  recommendationReason: string;
  /** How confident the engine is in the recommendation (0-1). */
  recommendationConfidence: number;
  /** Performance metrics computed for agent A. */
  agentAMetrics: AgentMetrics;
  /** Performance metrics computed for agent B. */
  agentBMetrics: AgentMetrics;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Performance figures derived from a single agent's action log.
 */
export interface AgentMetrics {
  /** Number of actions the agent performed. */
  totalActions: number;
  /** Number of actions that succeeded. */
  successfulActions: number;
  /** Number of actions that failed. */
  failedActions: number;
  /** Fraction of actions that succeeded (0-1). */
  successRate: number;
  /** Number of extract actions performed. */
  extractActions: number;
  /** Whether the agent is considered to have completed the task. */
  completedTask: boolean;
}
|
|
133
|
+
|
|
134
|
+
// =============================================================================
|
|
135
|
+
// Default Configuration
|
|
136
|
+
// =============================================================================
|
|
137
|
+
|
|
138
|
+
// Baseline settings applied when the caller does not override them.
// The four factor weights below add up to 1.0.
const DEFAULT_CONFIG: VerificationEngineConfig = {
  // Scores at or above this value are reported as reliable.
  reliabilityThreshold: 0.7,

  // Confidence factor weights.
  actionSuccessWeight: 0.3,
  extractPresenceWeight: 0.25,
  alignmentWeight: 0.3,
  completenessWeight: 0.15,
};
|
|
145
|
+
|
|
146
|
+
// =============================================================================
|
|
147
|
+
// Verification Engine Implementation
|
|
148
|
+
// =============================================================================
|
|
149
|
+
|
|
150
|
+
/**
 * Verification Engine for detecting hallucinations and analyzing agent results.
 *
 * Provides three analyses over agent action logs:
 * - `detectHallucination`: flags claimed results the log cannot support.
 * - `analyzeDiff`: compares two competing agents' logs and recommends a winner.
 * - `calculateConfidence`: produces a weighted 0-1 confidence score.
 */
export class VerificationEngine {
  /** Top-level result keys that carry bookkeeping rather than claimed data. */
  private static readonly NON_DATA_KEYS: readonly string[] = [
    'metadata',
    'verificationHash',
    'confidence',
    'error',
    'success',
  ];

  /** Top-level result keys that do not, by themselves, imply success. */
  private static readonly META_KEYS: readonly string[] = [
    'metadata',
    'verificationHash',
    'confidence',
    'error',
  ];

  private config: VerificationEngineConfig;

  /**
   * @param config Overrides merged on top of the default configuration.
   */
  constructor(config: Partial<VerificationEngineConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  // ===========================================================================
  // Hallucination Detection (Requirement 8.5)
  // ===========================================================================

  /**
   * Detect if a claimed result is a hallucination.
   *
   * A reason is recorded for each of these checks that fires:
   * 1. A non-null result is claimed although the action log is empty.
   * 2. The result claims success although every logged action failed.
   * 3. The result claims data although no extract action was logged.
   * 4. The result claims data but the log touched no intent IDs at all.
   * 5. The result holds more data items than the extract actions could
   *    plausibly have produced.
   *
   * Requirements: 8.5
   *
   * @param claimedResult The result the agent reported.
   * @param actionLog The actions the agent actually performed.
   * @returns The verdict, its confidence, the reasons and the action summary.
   */
  detectHallucination(
    claimedResult: unknown,
    actionLog: ActionLogEntry[]
  ): HallucinationDetectionResult {
    const reasons: string[] = [];
    const actionSummary = this.summarizeActions(actionLog);
    const claimsData = this.hasDataClaim(claimedResult);

    // Check 1: No actions at all but claiming a result
    if (actionLog.length === 0 && claimedResult !== null && claimedResult !== undefined) {
      reasons.push('Result claimed with no actions performed');
    }

    // Check 2: All actions failed but claiming success
    if (
      actionSummary.totalActions > 0 &&
      actionSummary.successfulActions === 0 &&
      this.isSuccessfulResult(claimedResult)
    ) {
      reasons.push('Success claimed but all actions failed');
    }

    // Check 3: Data claimed but no extract actions
    if (claimsData && actionSummary.extractActions === 0) {
      reasons.push('Data claimed but no extract actions performed');
    }

    // Check 4: Claimed data fields don't match the intent IDs in the log
    if (claimsData && this.checkDataFieldMismatch(claimedResult, actionSummary.uniqueIntentIds)) {
      reasons.push('Claimed data fields do not correspond to extracted elements');
    }

    // Check 5: Result complexity exceeds action capability
    if (this.isResultTooComplex(claimedResult, actionSummary)) {
      reasons.push('Result complexity exceeds what actions could produce');
    }

    return {
      isHallucination: reasons.length > 0,
      confidence: this.calculateHallucinationConfidence(reasons, actionSummary),
      reasons,
      claimedResult,
      actionSummary,
    };
  }

  /**
   * Summarize actions from an action log.
   *
   * Every entry whose result is not `'success'` is counted as failed, and the
   * intent ID of every entry (regardless of action type) is collected.
   */
  private summarizeActions(actionLog: ActionLogEntry[]): ActionSummary {
    const uniqueIntentIds = new Set<string>();

    let successfulActions = 0;
    let failedActions = 0;
    let extractActions = 0;
    let clickActions = 0;
    let typeActions = 0;
    let navigateActions = 0;

    for (const entry of actionLog) {
      uniqueIntentIds.add(entry.intentId);

      if (entry.result === 'success') {
        successfulActions++;
      } else {
        failedActions++;
      }

      switch (entry.action) {
        case 'extract':
          extractActions++;
          break;
        case 'click':
          clickActions++;
          break;
        case 'type':
          typeActions++;
          break;
        case 'navigate':
          navigateActions++;
          break;
      }
    }

    return {
      totalActions: actionLog.length,
      successfulActions,
      failedActions,
      extractActions,
      clickActions,
      typeActions,
      navigateActions,
      uniqueIntentIds: Array.from(uniqueIntentIds),
    };
  }

  /**
   * Check if a result indicates success.
   *
   * An explicit `success: false` is never a success claim, whatever else the
   * object contains. Otherwise `success: true`, a non-empty `data` payload, or
   * any key other than the metadata keys counts as claiming success.
   * Non-object results never claim success.
   */
  private isSuccessfulResult(result: unknown): boolean {
    if (result === null || result === undefined || typeof result !== 'object') {
      return false;
    }

    const obj = result as Record<string, unknown>;

    // An explicit failure flag must win over every implicit indicator;
    // otherwise the `success` key itself would count as a non-meta field.
    if (obj.success === false) {
      return false;
    }

    // Explicit success indicator
    if (obj.success === true) {
      return true;
    }

    // Data presence (implies success)
    const data = obj.data;
    if (data !== null && data !== undefined) {
      if (Array.isArray(data) && data.length > 0) {
        return true;
      }
      if (typeof data === 'object' && Object.keys(data as object).length > 0) {
        return true;
      }
    }

    // Any remaining non-metadata key counts as a (non-empty) result
    return Object.keys(obj).some((key) => !VerificationEngine.META_KEYS.includes(key));
  }

  /**
   * Check if a result claims to have data.
   *
   * A `data` field counts only when it is non-empty: a non-empty array, an
   * object with at least one key, or any non-null primitive. Any other
   * top-level key outside the bookkeeping keys also counts as a data claim.
   */
  private hasDataClaim(result: unknown): boolean {
    if (result === null || result === undefined || typeof result !== 'object') {
      return false;
    }

    const obj = result as Record<string, unknown>;

    // Explicit data field: judged by its content, not by its mere presence
    if ('data' in obj && this.isNonEmptyPayload(obj.data)) {
      return true;
    }

    // Data-like fields (excluding bookkeeping keys and the `data` key, which
    // was already judged above so that an empty payload is not a claim)
    return Object.keys(obj).some(
      (key) => key !== 'data' && !VerificationEngine.NON_DATA_KEYS.includes(key)
    );
  }

  /**
   * Tell whether a `data` payload actually carries something.
   *
   * `null`/`undefined`, empty arrays and empty objects are empty; every other
   * value (including primitives) is treated as content.
   */
  private isNonEmptyPayload(data: unknown): boolean {
    if (data === null || data === undefined) {
      return false;
    }
    if (Array.isArray(data)) {
      return data.length > 0;
    }
    if (typeof data === 'object') {
      return Object.keys(data as object).length > 0;
    }
    return true;
  }

  /**
   * Check if claimed data fields mismatch with the intent IDs from the log.
   *
   * Currently only the absence of any intent ID is treated as a mismatch; the
   * result itself is not inspected.
   */
  private checkDataFieldMismatch(
    _result: unknown,
    extractedIntentIds: string[]
  ): boolean {
    if (extractedIntentIds.length === 0) {
      return true; // No extractions means any data claim is a mismatch
    }

    // If we have extract actions, we assume the data could be valid
    // More sophisticated matching would require schema analysis
    return false;
  }

  /**
   * Check if result complexity exceeds what actions could produce.
   *
   * Counts the entries of `data` (array length or object key count) and allows
   * up to 10 items per extract action, with a floor of one item.
   */
  private isResultTooComplex(result: unknown, summary: ActionSummary): boolean {
    if (result === null || result === undefined || typeof result !== 'object') {
      return false;
    }

    const data = (result as Record<string, unknown>).data;

    // Count data items in result
    let dataItemCount = 0;
    if (Array.isArray(data)) {
      dataItemCount = data.length;
    } else if (typeof data === 'object' && data !== null) {
      dataItemCount = Object.keys(data).length;
    }

    // Allow a 10x multiplier for batch extractions, and always allow one item
    const maxReasonableItems = Math.max(summary.extractActions * 10, 1);

    return dataItemCount > maxReasonableItems;
  }

  /**
   * Calculate confidence in hallucination detection.
   *
   * Returns 0 when no reason was raised. Otherwise starts from 0.3 per reason
   * (capped at 0.9) and is lifted to at least 0.95 when there were no actions
   * at all, or at least 0.85 when every action failed.
   */
  private calculateHallucinationConfidence(
    reasons: string[],
    summary: ActionSummary
  ): number {
    if (reasons.length === 0) {
      return 0; // No hallucination detected
    }

    // Base confidence from number of reasons
    let confidence = Math.min(reasons.length * 0.3, 0.9);

    // Increase confidence if no actions at all
    if (summary.totalActions === 0) {
      confidence = Math.max(confidence, 0.95);
    }

    // Increase confidence if all actions failed
    if (summary.totalActions > 0 && summary.successfulActions === 0) {
      confidence = Math.max(confidence, 0.85);
    }

    return Math.min(confidence, 1);
  }

  // ===========================================================================
  // Diff Analysis for Conflicts (Requirement 8.4)
  // ===========================================================================

  /**
   * Compare action logs of competing agents and recommend a winner.
   *
   * Requirements: 8.4
   *
   * @param agentALog Action log of agent A.
   * @param agentBLog Action log of agent B.
   * @returns The divergence index, each agent's path from that index, the
   *   recommendation with its reason and confidence, and per-agent metrics
   *   (computed over the full logs, not just the diverging tails).
   */
  analyzeDiff(
    agentALog: ActionLogEntry[],
    agentBLog: ActionLogEntry[]
  ): ExtendedDiffAnalysis {
    const divergencePoint = this.findDivergencePoint(agentALog, agentBLog);

    const agentAMetrics = this.calculateAgentMetrics(agentALog);
    const agentBMetrics = this.calculateAgentMetrics(agentBLog);

    const { recommendation, reason, confidence } = this.determineWinner(
      agentAMetrics,
      agentBMetrics,
      divergencePoint
    );

    return {
      divergencePoint,
      agentAPath: agentALog.slice(divergencePoint),
      agentBPath: agentBLog.slice(divergencePoint),
      recommendation,
      recommendationReason: reason,
      recommendationConfidence: confidence,
      agentAMetrics,
      agentBMetrics,
    };
  }

  /**
   * Find the index where two action logs diverge.
   *
   * Returns the first index whose entries differ. When one log is a prefix of
   * the other (or both are identical) the length of the shorter log is
   * returned.
   */
  private findDivergencePoint(
    logA: ActionLogEntry[],
    logB: ActionLogEntry[]
  ): number {
    const minLength = Math.min(logA.length, logB.length);

    for (let i = 0; i < minLength; i++) {
      if (!this.actionsMatch(logA[i], logB[i])) {
        return i;
      }
    }

    // Common prefix exhausted: for identical logs this equals their length
    return minLength;
  }

  /**
   * Check if two action entries match (ignoring timestamps).
   */
  private actionsMatch(a: ActionLogEntry, b: ActionLogEntry): boolean {
    return (
      a.action === b.action &&
      a.intentId === b.intentId &&
      a.value === b.value &&
      a.result === b.result
    );
  }

  /**
   * Calculate metrics for an agent's action log.
   *
   * The task counts as completed only when at least one extract action itself
   * succeeded; a failed extract next to an unrelated success does not count.
   */
  private calculateAgentMetrics(log: ActionLogEntry[]): AgentMetrics {
    let successfulActions = 0;
    let failedActions = 0;
    let extractActions = 0;
    let completedTask = false;

    for (const entry of log) {
      const succeeded = entry.result === 'success';

      if (succeeded) {
        successfulActions++;
      } else if (entry.result === 'failure') {
        failedActions++;
      }

      if (entry.action === 'extract') {
        extractActions++;
        if (succeeded) {
          completedTask = true;
        }
      }
    }

    const totalActions = log.length;
    const successRate = totalActions > 0 ? successfulActions / totalActions : 0;

    return {
      totalActions,
      successfulActions,
      failedActions,
      successRate,
      extractActions,
      completedTask,
    };
  }

  /**
   * Determine the winner based on metrics.
   *
   * Order of decisions: both agents without any success are both invalid; an
   * agent that completed the task beats one that did not; otherwise the higher
   * success rate wins when the gap exceeds 0.1; otherwise the agent with more
   * extract actions wins; otherwise it is a tie.
   */
  private determineWinner(
    metricsA: AgentMetrics,
    metricsB: AgentMetrics,
    divergencePoint: number
  ): { recommendation: 'agent_a' | 'agent_b' | 'tie' | 'both_invalid'; reason: string; confidence: number } {
    // Both invalid if neither completed the task and neither had any success
    if (
      !metricsA.completedTask &&
      !metricsB.completedTask &&
      metricsA.successfulActions === 0 &&
      metricsB.successfulActions === 0
    ) {
      return {
        recommendation: 'both_invalid',
        reason: 'Both agents failed to complete any actions successfully',
        confidence: 0.95,
      };
    }

    // One completed, one didn't
    if (metricsA.completedTask && !metricsB.completedTask) {
      return {
        recommendation: 'agent_a',
        reason: 'Agent A completed the task while Agent B did not',
        confidence: 0.9,
      };
    }
    if (metricsB.completedTask && !metricsA.completedTask) {
      return {
        recommendation: 'agent_b',
        reason: 'Agent B completed the task while Agent A did not',
        confidence: 0.9,
      };
    }

    // Same completion status (both completed, or neither did but at least one
    // had some success) - compare success rates
    const rateDiff = Math.abs(metricsA.successRate - metricsB.successRate);

    if (rateDiff > 0.1) {
      if (metricsA.successRate > metricsB.successRate) {
        return {
          recommendation: 'agent_a',
          reason: `Agent A has higher success rate (${(metricsA.successRate * 100).toFixed(1)}% vs ${(metricsB.successRate * 100).toFixed(1)}%)`,
          confidence: 0.7 + rateDiff * 0.2,
        };
      }
      return {
        recommendation: 'agent_b',
        reason: `Agent B has higher success rate (${(metricsB.successRate * 100).toFixed(1)}% vs ${(metricsA.successRate * 100).toFixed(1)}%)`,
        confidence: 0.7 + rateDiff * 0.2,
      };
    }

    // Similar success rates - compare extract actions
    if (metricsA.extractActions > metricsB.extractActions) {
      return {
        recommendation: 'agent_a',
        reason: `Agent A performed more data extractions (${metricsA.extractActions} vs ${metricsB.extractActions})`,
        confidence: 0.65,
      };
    }
    if (metricsB.extractActions > metricsA.extractActions) {
      return {
        recommendation: 'agent_b',
        reason: `Agent B performed more data extractions (${metricsB.extractActions} vs ${metricsA.extractActions})`,
        confidence: 0.65,
      };
    }

    // Truly tied
    return {
      recommendation: 'tie',
      reason: `Both agents performed similarly (diverged at action ${divergencePoint})`,
      confidence: 0.5,
    };
  }

  // ===========================================================================
  // Confidence Scoring (Requirement 8.6)
  // ===========================================================================

  /**
   * Calculate confidence score for a verification result.
   *
   * Combines four factors (action success rate, extract presence,
   * action-result alignment, action completeness) as a weighted average using
   * the configured weights. When a proof is supplied the score is raised by
   * 0.1 if the proof says the result matches the actions and lowered by 0.2
   * otherwise, clamped to the 0-1 range.
   *
   * Requirements: 8.6
   *
   * @param claimedResult The result the agent reported.
   * @param actionLog The actions the agent actually performed.
   * @param proof Optional verification proof used to adjust the score.
   * @returns The final score, its factors and the reliability verdict.
   */
  calculateConfidence(
    claimedResult: unknown,
    actionLog: ActionLogEntry[],
    proof?: VerificationProof
  ): ConfidenceScoreResult {
    const factors: ConfidenceFactor[] = [];
    const summary = this.summarizeActions(actionLog);

    // Factor 1: Action success rate
    const successRateScore = summary.totalActions > 0
      ? summary.successfulActions / summary.totalActions
      : 0;
    factors.push({
      name: 'Action Success Rate',
      weight: this.config.actionSuccessWeight,
      score: successRateScore,
      description: `${summary.successfulActions}/${summary.totalActions} actions succeeded`,
    });

    // Factor 2: Extract action presence
    const hasExtracts = summary.extractActions > 0;
    factors.push({
      name: 'Extract Actions Present',
      weight: this.config.extractPresenceWeight,
      score: hasExtracts ? 1 : 0,
      description: hasExtracts
        ? `${summary.extractActions} extract actions performed`
        : 'No extract actions performed',
    });

    // Factor 3: Action-result alignment
    const alignmentScore = this.calculateAlignmentScore(claimedResult, summary);
    factors.push({
      name: 'Action-Result Alignment',
      weight: this.config.alignmentWeight,
      score: alignmentScore,
      description: alignmentScore > 0.7
        ? 'Result aligns well with actions'
        : 'Result may not fully align with actions',
    });

    // Factor 4: Action completeness
    const completenessScore = this.calculateCompletenessScore(summary);
    factors.push({
      name: 'Action Completeness',
      weight: this.config.completenessWeight,
      score: completenessScore,
      description: completenessScore > 0.7
        ? 'Action sequence appears complete'
        : 'Action sequence may be incomplete',
    });

    // Weighted average; guards against a configuration whose weights sum to 0
    const totalWeight = factors.reduce((sum, f) => sum + f.weight, 0);
    const weightedSum = factors.reduce((sum, f) => sum + f.score * f.weight, 0);
    let finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;

    // Adjust score based on proof if available
    if (proof) {
      finalScore = proof.resultMatchesActions
        ? Math.min(finalScore + 0.1, 1)
        : Math.max(finalScore - 0.2, 0);
    }

    return {
      score: finalScore,
      factors,
      isReliable: finalScore >= this.config.reliabilityThreshold,
      reliabilityThreshold: this.config.reliabilityThreshold,
    };
  }

  /**
   * Calculate alignment score between result and actions.
   *
   * @returns 1 when nothing is claimed; 0 when a result is claimed with no
   *   actions or success is claimed although all actions failed; 0.2 when data
   *   is claimed without extract actions; 0.9 when data is claimed and extract
   *   actions exist; 0.6 otherwise.
   */
  private calculateAlignmentScore(result: unknown, summary: ActionSummary): number {
    // No result claimed - perfect alignment (nothing to misalign)
    if (result === null || result === undefined) {
      return 1;
    }

    // No actions but claiming result - poor alignment
    if (summary.totalActions === 0) {
      return 0;
    }

    // All failures but claiming success - poor alignment
    if (summary.successfulActions === 0 && this.isSuccessfulResult(result)) {
      return 0;
    }

    // Data claims are judged by whether any extract action backs them
    if (this.hasDataClaim(result)) {
      return summary.extractActions === 0 ? 0.2 : 0.9;
    }

    // Default moderate alignment
    return 0.6;
  }

  /**
   * Calculate completeness score for action sequence.
   *
   * An empty log scores 0. Otherwise the score starts at 0.5 and gains 0.15
   * for any navigation, 0.15 for any click or type interaction and 0.2 for any
   * extraction, capped at 1.
   */
  private calculateCompletenessScore(summary: ActionSummary): number {
    // No actions - incomplete
    if (summary.totalActions === 0) {
      return 0;
    }

    let score = 0.5; // Base score

    // Has navigation (usually needed to start)
    if (summary.navigateActions > 0) {
      score += 0.15;
    }

    // Has interactions (clicks, types)
    if (summary.clickActions > 0 || summary.typeActions > 0) {
      score += 0.15;
    }

    // Has extractions (usually needed to get results)
    if (summary.extractActions > 0) {
      score += 0.2;
    }

    return Math.min(score, 1);
  }

  // ===========================================================================
  // Utility Methods
  // ===========================================================================

  /**
   * Get the current configuration.
   *
   * @returns A shallow copy, so mutating it does not affect the engine.
   */
  getConfig(): VerificationEngineConfig {
    return { ...this.config };
  }

  /**
   * Update configuration.
   *
   * @param config Settings to overwrite; unspecified settings are kept.
   */
  updateConfig(config: Partial<VerificationEngineConfig>): void {
    this.config = { ...this.config, ...config };
  }
}
|
|
771
|
+
|
|
772
|
+
// =============================================================================
|
|
773
|
+
// Factory Function
|
|
774
|
+
// =============================================================================
|
|
775
|
+
|
|
776
|
+
/**
 * Build a Verification Engine.
 *
 * @param config Optional overrides passed straight to the engine constructor.
 * @returns A freshly constructed engine instance.
 */
export function createVerificationEngine(
  config: Partial<VerificationEngineConfig> = {}
): VerificationEngine {
  const engine = new VerificationEngine(config);
  return engine;
}
|