outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,591 @@
1
+ /**
2
+ * Evaluation Orchestration - Binary evaluation of agent artifacts
3
+ *
4
+ * Evaluates whether an agent artifact meets all success criteria defined
5
+ * in an outcome. Returns exactly SUCCESS or FAILURE with structured reasons.
6
+ *
7
+ * @module eval/evaluateOutcome
8
+ */
9
+
10
+ import type { Outcome, SuccessCriterion } from '../outcomes/outcome.schema.js';
11
+ import {
12
+ type ValidationResult,
13
+ validateBuyingIntent,
14
+ validateCompanySize,
15
+ validateRole,
16
+ validateMessageLength,
17
+ validateEmail,
18
+ validateSecurityIssue,
19
+ validatePerformanceIssue,
20
+ validateNoiseFreeness,
21
+ validateComplexityReduction,
22
+ validateLinkedIn,
23
+ validateLeadGenPrecision,
24
+ validateCompanyDataset,
25
+ type CompanyDataset,
26
+ validateTestsPass,
27
+ validateBuilds,
28
+ validateLintClean,
29
+ validateBenchmark,
30
+ validateSecurityScan,
31
+ } from './validators.js';
32
+ import {
33
+ validateI983RequiredFields,
34
+ validateOPTDateRange,
35
+ validateEVerifyFormat,
36
+ validateTrainingDescriptionLength,
37
+ } from './immigration-validators.js';
38
+
39
/**
 * Verdict for one success criterion.
 */
export interface CriterionResult {
  /** Name of the criterion this verdict belongs to. */
  name: string;
  /** True when the criterion was satisfied, false otherwise. */
  passed: boolean;
  /** Human-readable explanation of why the criterion passed or failed. */
  reason: string;
}
50
+
51
/**
 * Payload an agent hands in for evaluation.
 *
 * Every field is optional at the type level. Which fields must actually be
 * present depends on the outcome and is checked at runtime by
 * `validateArtifactSchema`.
 */
export interface ArtifactContent {
  /** Message text generated by the agent. */
  message?: string;
  /** Email address of the target (old-format field; see `email`). */
  targetEmail?: string;
  /** Name of the target company. */
  targetCompany?: string;
  /** Employee count of the target company (old-format field; see `companySize`). */
  targetCompanySize?: number;
  /** Role of the targeted person (old-format field; see `role`). */
  targetRole?: string;

  // ---- Image / video generation ----
  /** Kind of content; 'image' requires `imageUrl`, 'video' requires `videoUrl`. */
  type?: 'image' | 'video' | 'text' | 'code';
  imageUrl?: string;
  videoUrl?: string;
  originalPrompt?: string;
  revisedPrompt?: string;
  dimensions?: string;
  model?: string;
  style?: string;
  target?: string;
  generatedText?: string;
  duration?: string;

  // ---- Lead generation (new-format counterparts of the target* fields) ----
  email?: string;
  companySize?: number;
  role?: string;
  linkedIn?: string;

  // ---- Code review ----
  /** Issues reported by the reviewing agent. */
  issues?: {
    type: string;
    severity: string;
    description: string;
  }[];
  /** Review comments, each tied to the line content it refers to. */
  comments?: {
    lineContent: string;
    comment: string;
  }[];
  /** Optional refactoring proposal with before/after complexity figures. */
  refactorSuggestion?: {
    originalComplexity: number;
    suggestedComplexity: number;
    description: string;
  };

  // ---- Immigration compliance ----
  extractedFormData?: Record<string, unknown>;

  // ---- Code delivery ----
  repoPath?: string;
  worktreePath?: string;
  commitSha?: string;
  testCommand?: string;
  buildCommand?: string;
  lintCommand?: string;
  benchmarkCommand?: string;
  securityScanCommand?: string;
  testResult?: Record<string, unknown>;
  buildResult?: Record<string, unknown>;
  lintResult?: Record<string, unknown>;
  benchmarkResult?: Record<string, unknown>;
  securityScanResult?: Record<string, unknown>;
  code?: string;
  language?: string;
}
121
+
122
/**
 * Envelope around the content an agent submits for one attempt at an outcome.
 */
export interface AgentArtifact {
  /** Identifier of the agent that produced the artifact. */
  agentId: string;
  /** Identifier of the outcome the agent was attempting. */
  outcomeId: string;
  /** Which attempt this is, counting from 1. */
  attemptNumber: number;
  /** The content the agent produced. */
  content: ArtifactContent;
  /** ISO timestamp of when the artifact was created. */
  timestamp: string;
}
137
+
138
/**
 * What `evaluateOutcome` reports for an artifact.
 * The verdict is strictly binary: SUCCESS or FAILURE.
 */
export interface EvaluationResult {
  /** Binary verdict, exactly 'SUCCESS' or 'FAILURE'. */
  status: 'SUCCESS' | 'FAILURE';
  /** Human-readable summary of why this verdict was reached. */
  reason: string;
  /** One entry per criterion that was evaluated. */
  criteriaResults: CriterionResult[];
  /** Verification details; populated only when the status is SUCCESS. */
  verificationDetails?: Record<string, unknown>;
}
152
+
153
+
154
/**
 * Signature shared by every entry of `validatorMap`: takes the artifact content
 * and the criterion's params, returns the validator's ValidationResult.
 */
type ValidatorFn = (content: ArtifactContent, params: Record<string, unknown>) => ValidationResult;

/**
 * Fixed source diff handed to `validateNoiseFreeness`.
 * For mock mode we assume the source diff contains these mock lines.
 */
const MOCK_SOURCE_DIFF = `const query = "SELECT * FROM users WHERE username = '" + username + "' AND password = '" + password + "'";
const permissions = db.query("SELECT * FROM permissions WHERE user_id = " + user.id);`;

/**
 * Map of validator names to their implementation functions.
 * Used to dynamically call validators based on outcome configuration.
 *
 * Each adapter pulls the fields its validator needs out of the artifact
 * content and forwards the relevant criterion params. Numeric defaults use
 * `??` so that an explicitly configured `0` is honoured instead of being
 * replaced by the default.
 */
const validatorMap: Record<string, ValidatorFn> = {
  validateBuyingIntent: (content, params) => {
    const keywords = params.keywords as string[];
    return validateBuyingIntent(content.message || '', keywords);
  },
  validateCompanySize: (content, params) => {
    const minimum = params.minimum as number;
    // Handle both old format (targetCompanySize) and new format (companySize)
    const companySize = (content as any).companySize ?? content.targetCompanySize;
    return validateCompanySize(companySize, minimum);
  },
  validateRole: (content, params) => {
    const excludedRoles = params.excludedRoles as string[];
    // Handle both old format (targetRole) and new format (role)
    const role = (content as any).role ?? content.targetRole;
    return validateRole(role, excludedRoles);
  },
  validateMessageLength: (content, params) => {
    const minWords = params.minWords as number;
    return validateMessageLength(content.message || '', minWords);
  },
  validateEmail: (content) => {
    // Handle both old format (targetEmail) and new format (email)
    const email = (content as any).email ?? content.targetEmail;
    return validateEmail(email);
  },

  // Code review battle validators
  validateSecurityIssue: (content, params) => {
    // `||` is deliberate here: an empty severity string falls back to CRITICAL.
    const requiredSeverity = (params.requiredSeverity as string) || 'CRITICAL';
    return validateSecurityIssue(content as any, requiredSeverity);
  },
  validatePerformanceIssue: (content) => validatePerformanceIssue(content as any),
  validateNoiseFreeness: (content, _params) => validateNoiseFreeness(content as any, MOCK_SOURCE_DIFF),
  validateComplexityReduction: (content, params) => {
    // `??` keeps an explicit minReduction of 0; only a missing value defaults to 2.
    const minReduction = (params.minReduction as number | null | undefined) ?? 2;
    return validateComplexityReduction(content as any, minReduction);
  },

  // Lead gen battle validators
  validateLinkedIn: (content) => validateLinkedIn((content as any).linkedIn),
  validateLeadGenPrecision: (content) => validateLeadGenPrecision(content as any),

  // Code delivery validators: each accepts the *Result field or its shorter
  // alias (tests/build/lint/benchmark/security), falling back to an empty object.
  validateTestsPass: (content, params) => {
    const c = content as any;
    return validateTestsPass(c.testResult ?? c.tests ?? {}, params as { minPassRate?: number });
  },
  validateBuilds: (content) => {
    const c = content as any;
    return validateBuilds(c.buildResult ?? c.build ?? {});
  },
  validateLintClean: (content, params) => {
    const c = content as any;
    return validateLintClean(c.lintResult ?? c.lint ?? {}, params as { allowWarnings?: boolean });
  },
  validateBenchmark: (content, params) => {
    const c = content as any;
    return validateBenchmark(c.benchmarkResult ?? c.benchmark ?? {}, params as { p95ThresholdMs: number });
  },
  validateSecurityScan: (content, params) => {
    const c = content as any;
    return validateSecurityScan(
      c.securityScanResult ?? c.security ?? {},
      params as { maxSeverity?: 'critical' | 'high' | 'medium' | 'low' }
    );
  },

  // Dataset validators - Outcome-Verified Marketplace
  validateCompanyDataset: (content, params) => {
    // `??` keeps an explicit minRows of 0; only a missing value defaults to 25.
    const minRows = (params.minRows as number | null | undefined) ?? 25;
    return validateCompanyDataset(content as unknown as CompanyDataset, minRows);
  },

  // Immigration compliance validators (receive the whole content object)
  validateI983RequiredFields: (content, params) =>
    validateI983RequiredFields(content as Record<string, unknown>, params as { requiredFields: string[] }),
  validateOPTDateRange: (content, params) =>
    validateOPTDateRange(
      content as Record<string, unknown>,
      params as { field: string; minDaysFromNow: number; maxDaysFromNow: number }
    ),
  validateEVerifyFormat: (content, params) =>
    validateEVerifyFormat(content as Record<string, unknown>, params as { field: string }),
  validateTrainingDescriptionLength: (content, params) =>
    validateTrainingDescriptionLength(
      content as Record<string, unknown>,
      params as { field: string; minWords: number }
    ),
};
251
+
252
/** Checks one outcome family's content fields, appending any problems to `errors`. */
type ContentCheck = (content: Record<string, unknown>, errors: string[]) => void;

/** True for non-null objects that are not arrays (`typeof []` is also 'object'). */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** True for strings that contain something other than whitespace. */
function isNonEmptyString(value: unknown): value is string {
  return typeof value === 'string' && value.trim() !== '';
}

/** Checks the envelope fields every artifact carries, regardless of outcome. */
function checkEnvelope(artifact: Record<string, unknown>, errors: string[]): void {
  if (!isNonEmptyString(artifact.agentId)) {
    errors.push('Artifact must have a non-empty string "agentId"');
  }
  if (!isNonEmptyString(artifact.outcomeId)) {
    errors.push('Artifact must have a non-empty string "outcomeId"');
  }
  const attempt = artifact.attemptNumber;
  if (typeof attempt !== 'number' || !Number.isInteger(attempt) || attempt < 1) {
    errors.push('Artifact must have a positive integer "attemptNumber"');
  }
  if (!isNonEmptyString(artifact.timestamp)) {
    errors.push('Artifact must have a non-empty string "timestamp"');
  }
}

/** Code review battle: arrays "issues" and "comments", optional "refactorSuggestion". */
const checkCodeReviewContent: ContentCheck = (content, errors) => {
  if (!Array.isArray(content.issues)) {
    errors.push('Code review artifact content must have an array "issues"');
  } else {
    content.issues.forEach((issue: unknown, index: number) => {
      if (!isPlainRecord(issue)) {
        errors.push(`issues[${index}] must be an object`);
        return;
      }
      for (const field of ['type', 'severity', 'description']) {
        if (typeof issue[field] !== 'string') {
          errors.push(`issues[${index}] must have a string "${field}"`);
        }
      }
    });
  }

  if (!Array.isArray(content.comments)) {
    errors.push('Code review artifact content must have an array "comments"');
  } else {
    content.comments.forEach((comment: unknown, index: number) => {
      if (!isPlainRecord(comment)) {
        errors.push(`comments[${index}] must be an object`);
        return;
      }
      for (const field of ['lineContent', 'comment']) {
        if (typeof comment[field] !== 'string') {
          errors.push(`comments[${index}] must have a string "${field}"`);
        }
      }
    });
  }

  // Refactor suggestion is optional
  const refactor = content.refactorSuggestion;
  if (refactor === undefined) {
    return;
  }
  if (!isPlainRecord(refactor)) {
    errors.push('refactorSuggestion must be an object if provided');
    return;
  }
  if (typeof refactor.originalComplexity !== 'number') {
    errors.push('refactorSuggestion must have a number "originalComplexity"');
  }
  if (typeof refactor.suggestedComplexity !== 'number') {
    errors.push('refactorSuggestion must have a number "suggestedComplexity"');
  }
  if (typeof refactor.description !== 'string') {
    errors.push('refactorSuggestion must have a string "description"');
  }
};

/** Lead gen battle: email, companySize, role, linkedIn. */
const checkLeadGenContent: ContentCheck = (content, errors) => {
  if (typeof content.email !== 'string') {
    errors.push('Lead gen artifact content must have a string "email"');
  }
  if (typeof content.companySize !== 'number') {
    errors.push('Lead gen artifact content must have a number "companySize"');
  }
  if (typeof content.role !== 'string') {
    errors.push('Lead gen artifact content must have a string "role"');
  }
  if (typeof content.linkedIn !== 'string') {
    errors.push('Lead gen artifact content must have a string "linkedIn"');
  }
};

/** Code delivery outcomes: structured execution summaries plus the code itself. */
const checkCodeDeliveryContent: ContentCheck = (content, errors) => {
  for (const key of ['testResult', 'buildResult', 'lintResult', 'benchmarkResult']) {
    if (!isPlainRecord(content[key])) {
      errors.push(`Code artifact must include "${key}" object`);
    }
  }
  if (content.securityScanResult !== undefined && !isPlainRecord(content.securityScanResult)) {
    errors.push('If provided, "securityScanResult" must be an object');
  }
  const optionalStrings = [
    'repoPath',
    'worktreePath',
    'commitSha',
    'testCommand',
    'buildCommand',
    'lintCommand',
    'benchmarkCommand',
    'securityScanCommand',
  ];
  for (const key of optionalStrings) {
    if (content[key] !== undefined && typeof content[key] !== 'string') {
      errors.push(`If provided, "${key}" must be a string`);
    }
  }
  for (const key of ['code', 'language']) {
    if (typeof content[key] !== 'string') {
      errors.push(`Code artifact must include "${key}" string`);
    }
  }
};

/** Dataset outcomes (Outcome-Verified Marketplace): "companies" array and "generatedAt". */
const checkDatasetContent: ContentCheck = (content, errors) => {
  if (!Array.isArray(content.companies)) {
    errors.push('Dataset artifact content must have an array "companies"');
  }
  if (typeof content.generatedAt !== 'string') {
    errors.push('Dataset artifact content must have a string "generatedAt"');
  }
};

/** Content creation battles: a "type" plus the URL matching that type. */
const checkMediaContent: ContentCheck = (content, errors) => {
  if (typeof content.type !== 'string') {
    errors.push('Content must have a string "type" ("image" | "video")');
  }
  if (content.type === 'image' && typeof content.imageUrl !== 'string') {
    errors.push('Image artifact content must have a string "imageUrl"');
  }
  if (content.type === 'video' && typeof content.videoUrl !== 'string') {
    errors.push('Video artifact content must have a string "videoUrl"');
  }
};

/** F1 STEM OPT compliance: requires the "extractedFormData" object. */
const checkComplianceContent: ContentCheck = (content, errors) => {
  if (!isPlainRecord(content.extractedFormData)) {
    errors.push('Artifact content must have an object "extractedFormData"');
  }
};

/** Default (qualified_sales_interest and other outcomes): message plus target* fields. */
const checkDefaultContent: ContentCheck = (content, errors) => {
  if (typeof content.message !== 'string') {
    errors.push('Artifact content must have a string "message"');
  }
  if (typeof content.targetEmail !== 'string') {
    errors.push('Artifact content must have a string "targetEmail"');
  }
  if (typeof content.targetCompany !== 'string') {
    errors.push('Artifact content must have a string "targetCompany"');
  }
  if (typeof content.targetCompanySize !== 'number') {
    errors.push('Artifact content must have a number "targetCompanySize"');
  }
  if (typeof content.targetRole !== 'string') {
    errors.push('Artifact content must have a string "targetRole"');
  }
};

/**
 * Picks the content check for an outcome name.
 * The order of the tests matters: exact names are tried before the
 * substring matches ("dataset", then "image"/"video"/"content").
 */
function selectContentCheck(outcomeName: string): ContentCheck {
  if (outcomeName === 'code_review_battle') {
    return checkCodeReviewContent;
  }
  if (outcomeName === 'lead_gen_battle') {
    return checkLeadGenContent;
  }
  if (
    outcomeName === 'feature_implementation' ||
    outcomeName === 'refactor_task' ||
    outcomeName === 'test_generation'
  ) {
    return checkCodeDeliveryContent;
  }
  if (outcomeName === 'ai_sales_tools_dataset_v1' || outcomeName.includes('dataset')) {
    return checkDatasetContent;
  }
  if (outcomeName.includes('image') || outcomeName.includes('video') || outcomeName.includes('content')) {
    return checkMediaContent;
  }
  if (outcomeName === 'f1_stem_opt_compliance') {
    return checkComplianceContent;
  }
  return checkDefaultContent;
}

/**
 * Validates that an artifact has all required content fields for the given outcome.
 *
 * Envelope fields (agentId, outcomeId, attemptNumber, timestamp) are checked
 * first, then the content fields required by the outcome's family. All
 * problems are collected rather than stopping at the first one. Arrays are
 * not accepted where an object is required.
 *
 * @param artifact - The artifact to validate
 * @param outcome - The outcome to validate against (only its name is read)
 * @returns ValidationResult whose `errors` lists every problem found
 */
function validateArtifactSchema(artifact: unknown, outcome: Outcome): ValidationResult {
  if (!isPlainRecord(artifact)) {
    return { valid: false, errors: ['Artifact must be an object'] };
  }

  const errors: string[] = [];
  checkEnvelope(artifact, errors);

  // Validate content based on outcome type
  const content = artifact.content;
  if (!isPlainRecord(content)) {
    errors.push('Artifact must have an object "content"');
  } else {
    selectContentCheck(outcome.name)(content, errors);
  }

  return { valid: errors.length === 0, errors };
}
468
+
469
+
470
/**
 * Evaluates a single success criterion against artifact content.
 *
 * The validator is looked up by `criterion.validator` among the map's own
 * entries only, so names inherited from Object.prototype (for example
 * "constructor" or "toString") are reported as unknown instead of being
 * invoked. Unknown validators and validators that throw both fail closed.
 *
 * @param criterion - The criterion to evaluate
 * @param content - The artifact content to evaluate against
 * @returns CriterionResult with pass/fail status and reason
 */
function evaluateCriterion(
  criterion: SuccessCriterion,
  content: ArtifactContent
): CriterionResult {
  const isRegistered = Object.prototype.hasOwnProperty.call(validatorMap, criterion.validator);
  const validator = isRegistered ? validatorMap[criterion.validator] : undefined;

  if (typeof validator !== 'function') {
    // Unknown validator - fail closed
    return {
      name: criterion.name,
      passed: false,
      reason: `Unknown validator: ${criterion.validator}`,
    };
  }

  try {
    const result = validator(content, criterion.params);

    return {
      name: criterion.name,
      passed: result.valid,
      reason: result.valid
        ? `Criterion "${criterion.name}" passed`
        : result.errors.join('; '),
    };
  } catch (error) {
    // Validator threw an error - fail closed
    const errorMessage = error instanceof Error ? error.message : 'Unknown error';
    return {
      name: criterion.name,
      passed: false,
      reason: `Validator error: ${errorMessage}`,
    };
  }
}
512
+
513
/**
 * Evaluates whether an agent artifact meets all success criteria for an outcome.
 *
 * This function implements binary evaluation logic:
 * - Returns SUCCESS only if ALL criteria pass
 * - Returns FAILURE if ANY criterion fails or data is invalid
 * - Fails closed on errors (returns FAILURE, not exceptions), including a
 *   malformed outcome definition (missing name or non-array successCriteria)
 *
 * Checks run in this order: outcome definition shape, artifact schema,
 * artifact/outcome id match, then every success criterion.
 *
 * @param outcome - The outcome definition with success criteria
 * @param artifact - The agent's produced artifact
 * @returns EvaluationResult with binary status and detailed criteria results
 *
 * @example
 * const result = await evaluateOutcome(qualifiedSalesInterest, artifact);
 * if (result.status === 'SUCCESS') {
 *   console.log('Payout triggered:', result.verificationDetails);
 * } else {
 *   console.log('Failed:', result.reason);
 * }
 *
 * @see Requirements 5.1, 5.2, 5.3, 5.4
 */
export async function evaluateOutcome(
  outcome: Outcome,
  artifact: AgentArtifact
): Promise<EvaluationResult> {
  // Guard the outcome definition itself: the steps below read outcome.name and
  // iterate outcome.successCriteria, which would otherwise throw.
  const definition = outcome as Partial<Outcome> | null | undefined;
  if (typeof definition?.name !== 'string' || !Array.isArray(definition.successCriteria)) {
    return {
      status: 'FAILURE',
      reason: 'Invalid outcome definition: expected a string "name" and an array "successCriteria"',
      criteriaResults: [],
    };
  }

  // Validate artifact schema first - fail closed on invalid data
  const schemaValidation = validateArtifactSchema(artifact, outcome);
  if (!schemaValidation.valid) {
    return {
      status: 'FAILURE',
      reason: `Invalid artifact: ${schemaValidation.errors.join('; ')}`,
      criteriaResults: [],
    };
  }

  // Verify artifact is for the correct outcome
  if (artifact.outcomeId !== outcome.name) {
    return {
      status: 'FAILURE',
      reason: `Artifact outcome mismatch: expected "${outcome.name}", got "${artifact.outcomeId}"`,
      criteriaResults: [],
    };
  }

  // Evaluate all criteria (no short-circuit, so every criterion gets a result)
  const criteriaResults: CriterionResult[] = outcome.successCriteria.map(
    (criterion) => evaluateCriterion(criterion, artifact.content)
  );
  const failedCriteria = criteriaResults.filter((result) => !result.passed);

  if (failedCriteria.length > 0) {
    // FAILURE - include structured reason
    const failureReasons = failedCriteria.map((c) => c.reason).join('; ');
    return {
      status: 'FAILURE',
      reason: `Failed ${failedCriteria.length} of ${criteriaResults.length} criteria: ${failureReasons}`,
      criteriaResults,
    };
  }

  // SUCCESS - include verification details.
  // NOTE(review): an outcome with zero criteria reaches this point and is
  // reported as SUCCESS; confirm the outcome schema requires at least one criterion.
  return {
    status: 'SUCCESS',
    reason: `All ${criteriaResults.length} criteria passed`,
    criteriaResults,
    verificationDetails: {
      outcomeId: outcome.name,
      payoutAmount: outcome.payoutAmount,
      agentId: artifact.agentId,
      attemptNumber: artifact.attemptNumber,
      evaluatedAt: new Date().toISOString(),
      criteriaCount: criteriaResults.length,
    },
  };
}