outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,256 @@
1
+ /**
2
+ * Code Outcomes - SWE-bench style outcome definitions
3
+ *
4
+ * Defines outcome schemas for code generation tasks that integrate
5
+ * with our existing bounty and evaluation system.
6
+ *
7
+ * @module outcomes/code-outcomes
8
+ */
9
+
10
+ import type { TestCase } from '../eval/code-validators.js';
11
+
12
+ /**
13
+ * Code-specific outcome for SWE-bench style tasks.
14
+ * Extends our standard outcome pattern for code generation.
15
+ */
16
+ export interface CodeOutcome {
17
+ /** Unique identifier for this outcome (used as its key in CODE_OUTCOMES) */
18
+ name: string;
19
+ /** Human-readable description */
20
+ description: string;
21
+ /** Category: bug-fix, feature, refactor, optimization, or test */
22
+ category: 'bug-fix' | 'feature' | 'refactor' | 'optimization' | 'test';
23
+ /** Programming language */
24
+ language: 'python' | 'typescript' | 'javascript' | 'go' | 'rust';
25
+ /** Difficulty level, 1-5 (maps to training levels) */
26
+ difficulty: 1 | 2 | 3 | 4 | 5;
27
+ /** Payout amount in USD for successful completion */
28
+ payoutAmount: number;
29
+ /** The issue/problem description (SWE-bench style) */
30
+ issueDescription: string;
31
+ /** Keywords that the solution should address */
32
+ issueKeywords: string[];
33
+ /** Optional: Starting code/context */
34
+ starterCode?: string;
35
+ /** Optional: File paths involved */
36
+ filePaths?: string[];
37
+ /** Test cases for validation */
38
+ testCases: TestCase[];
39
+ /** Required syntax elements */
40
+ requiredSyntax?: string[];
41
+ /** Structure requirements */
42
+ structureRequirements?: {
43
+ mustHaveFunction?: boolean;
44
+ mustHaveClass?: boolean;
45
+ mustHaveImports?: boolean;
46
+ minLines?: number;
47
+ maxLines?: number;
48
+ };
49
+ /** Quality requirements */
50
+ qualityRequirements?: {
51
+ mustHaveErrorHandling?: boolean;
52
+ mustHaveComments?: boolean;
53
+ maxComplexity?: number;
54
+ noConsoleLog?: boolean;
55
+ };
56
+ }
57
+
58
+ /**
59
+ * Pre-defined SWE-bench style outcomes for WAIESL battles.
60
+ * These map to different training levels and agent specializations.
61
+ */
62
+ export const CODE_OUTCOMES: Record<string, CodeOutcome> = {
63
+ // Level 1: Basic code tasks
64
+ 'fix-null-check': {
65
+ name: 'fix-null-check',
66
+ description: 'Add proper null/undefined checking to prevent runtime errors',
67
+ category: 'bug-fix',
68
+ language: 'typescript',
69
+ difficulty: 1,
70
+ payoutAmount: 50,
71
+ issueDescription: `
72
+ Bug Report: Application crashes with "Cannot read property of undefined"
73
+
74
+ When users submit a form with optional fields left empty, the application crashes.
75
+ The processUserData function doesn't handle cases where optional fields are undefined.
76
+
77
+ Expected: Function should gracefully handle undefined values
78
+ Actual: TypeError: Cannot read property 'trim' of undefined
79
+ `,
80
+ issueKeywords: ['null', 'undefined', 'optional', 'check', 'trim'],
81
+ testCases: [
82
+ { name: 'handles undefined', input: undefined, expectedOutput: '', type: 'edge-case' },
83
+ { name: 'handles null', input: null, expectedOutput: '', type: 'edge-case' },
84
+ { name: 'handles valid string', input: ' hello ', expectedOutput: 'hello', type: 'unit' },
85
+ ],
86
+ requiredSyntax: ['if', '===', 'undefined'],
87
+ structureRequirements: { mustHaveFunction: true, minLines: 3 },
88
+ },
89
+
90
+ // Level 2: Intermediate bug fixes
91
+ 'fix-async-race-condition': {
92
+ name: 'fix-async-race-condition',
93
+ description: 'Fix race condition in async data fetching',
94
+ category: 'bug-fix',
95
+ language: 'typescript',
96
+ difficulty: 2,
97
+ payoutAmount: 100,
98
+ issueDescription: `
99
+ Bug Report: Stale data displayed after rapid navigation
100
+
101
+ When users quickly navigate between pages, sometimes stale data from a previous
102
+ request is displayed instead of the current page's data.
103
+
104
+ Root cause: Multiple concurrent fetch requests without cancellation.
105
+
106
+ Expected: Only show data from the most recent request
107
+ Actual: Random stale data appears
108
+ `,
109
+ issueKeywords: ['async', 'race', 'abort', 'cancel', 'fetch', 'stale'],
110
+ testCases: [
111
+ { name: 'cancels previous request', input: { requests: 3 }, expectedOutput: { completed: 1 }, type: 'unit' },
112
+ { name: 'returns latest data', input: { sequence: [1, 2, 3] }, expectedOutput: 3, type: 'integration' },
113
+ ],
114
+ requiredSyntax: ['async', 'await', 'AbortController'],
115
+ structureRequirements: { mustHaveFunction: true, mustHaveImports: false, minLines: 10 },
116
+ qualityRequirements: { mustHaveErrorHandling: true },
117
+ },
118
+
119
+ // Level 3: Feature implementation
120
+ 'implement-retry-logic': {
121
+ name: 'implement-retry-logic',
122
+ description: 'Implement exponential backoff retry logic for API calls',
123
+ category: 'feature',
124
+ language: 'typescript',
125
+ difficulty: 3,
126
+ payoutAmount: 200,
127
+ issueDescription: `
128
+ Feature Request: Add retry logic with exponential backoff
129
+
130
+ API calls sometimes fail due to transient network issues. We need a retry
131
+ mechanism that:
132
+ 1. Retries failed requests up to 3 times
133
+ 2. Uses exponential backoff (1s, 2s, 4s delays)
134
+ 3. Only retries on specific error codes (500, 502, 503, 504)
135
+ 4. Throws after all retries exhausted
136
+
137
+ Should be a reusable utility function.
138
+ `,
139
+ issueKeywords: ['retry', 'exponential', 'backoff', 'delay', 'attempts', 'error'],
140
+ testCases: [
141
+ { name: 'succeeds on first try', input: { failCount: 0 }, expectedOutput: { attempts: 1 }, type: 'unit' },
142
+ { name: 'retries on failure', input: { failCount: 2 }, expectedOutput: { attempts: 3 }, type: 'unit' },
143
+ { name: 'throws after max retries', input: { failCount: 5 }, expectedOutput: { error: true }, type: 'edge-case' },
144
+ ],
145
+ requiredSyntax: ['async', 'await', 'for', 'throw', 'delay'],
146
+ structureRequirements: { mustHaveFunction: true, minLines: 15, maxLines: 50 },
147
+ qualityRequirements: { mustHaveErrorHandling: true, mustHaveComments: true },
148
+ },
149
+
150
+ // Level 4: Complex refactoring
151
+ 'refactor-to-strategy-pattern': {
152
+ name: 'refactor-to-strategy-pattern',
153
+ description: 'Refactor switch statement to Strategy pattern',
154
+ category: 'refactor',
155
+ language: 'typescript',
156
+ difficulty: 4,
157
+ payoutAmount: 350,
158
+ issueDescription: `
159
+ Refactoring Request: Replace payment switch with Strategy pattern
160
+
161
+ Current code has a massive switch statement for payment processing:
162
+ - CreditCard
163
+ - PayPal
164
+ - Crypto
165
+ - BankTransfer
166
+
167
+ Each case has 50+ lines. Adding new payment methods requires modifying
168
+ the switch. This violates Open/Closed principle.
169
+
170
+ Refactor to use Strategy pattern:
171
+ 1. Create PaymentStrategy interface
172
+ 2. Implement concrete strategies for each payment type
173
+ 3. Create PaymentContext that uses strategies
174
+ 4. Ensure existing tests still pass
175
+ `,
176
+ issueKeywords: ['strategy', 'pattern', 'interface', 'class', 'payment', 'refactor'],
177
+ testCases: [
178
+ { name: 'processes credit card', input: { type: 'credit', amount: 100 }, expectedOutput: { success: true }, type: 'integration' },
179
+ { name: 'processes paypal', input: { type: 'paypal', amount: 50 }, expectedOutput: { success: true }, type: 'integration' },
180
+ { name: 'extensible for new types', input: { type: 'new' }, expectedOutput: { extensible: true }, type: 'unit' },
181
+ ],
182
+ requiredSyntax: ['interface', 'class', 'implements', 'constructor'],
183
+ structureRequirements: { mustHaveClass: true, mustHaveFunction: true, minLines: 40 },
184
+ qualityRequirements: { mustHaveComments: true, maxComplexity: 10 },
185
+ },
186
+
187
+ // Level 5: Elite challenge (Codeon battle)
188
+ 'optimize-algorithm-complexity': {
189
+ name: 'optimize-algorithm-complexity',
190
+ description: 'Optimize O(n²) algorithm to O(n log n) or better',
191
+ category: 'optimization',
192
+ language: 'typescript',
193
+ difficulty: 5,
194
+ payoutAmount: 500,
195
+ issueDescription: `
196
+ Performance Critical: findDuplicates is too slow on large datasets
197
+
198
+ Current implementation uses nested loops - O(n²) complexity.
199
+ With 100k items, it takes 30+ seconds.
200
+
201
+ Requirements:
202
+ 1. Maintain same function signature
203
+ 2. Achieve O(n log n) or O(n) complexity
204
+ 3. Handle edge cases (empty array, single element, all duplicates)
205
+ 4. Memory usage should be reasonable (not O(n²) space)
206
+
207
+ Current (slow) implementation:
208
+ function findDuplicates(arr: number[]): number[] {
209
+ const duplicates = [];
210
+ for (let i = 0; i < arr.length; i++) {
211
+ for (let j = i + 1; j < arr.length; j++) {
212
+ if (arr[i] === arr[j] && !duplicates.includes(arr[i])) {
213
+ duplicates.push(arr[i]);
214
+ }
215
+ }
216
+ }
217
+ return duplicates;
218
+ }
219
+ `,
220
+ issueKeywords: ['optimize', 'complexity', 'O(n)', 'performance', 'Set', 'Map', 'hash'],
221
+ testCases: [
222
+ { name: 'finds duplicates', input: [1, 2, 2, 3, 3, 3], expectedOutput: [2, 3], type: 'unit' },
223
+ { name: 'handles empty', input: [], expectedOutput: [], type: 'edge-case' },
224
+ { name: 'handles no duplicates', input: [1, 2, 3], expectedOutput: [], type: 'edge-case' },
225
+ { name: 'handles large array efficiently', input: { size: 100000 }, expectedOutput: { timeMs: '<1000' }, type: 'integration' },
226
+ ],
227
+ requiredSyntax: ['Set', 'Map', 'function'],
228
+ structureRequirements: { mustHaveFunction: true, minLines: 5, maxLines: 30 },
229
+ qualityRequirements: { mustHaveComments: true, noConsoleLog: true },
230
+ },
231
+ };
232
+
233
+ /**
234
+ * Return every CODE_OUTCOMES entry whose `difficulty` equals `level`; the array is empty when none match.
235
+ */
236
+ export function getCodeOutcomesByLevel(level: CodeOutcome['difficulty']): CodeOutcome[] {
237
+ return Object.values(CODE_OUTCOMES).filter((outcome) => outcome.difficulty === level);
238
+ }
239
+
240
+ /**
241
+ * Return every CODE_OUTCOMES entry whose `category` equals the given category; the array is empty when none match.
242
+ */
243
+ export function getCodeOutcomesByCategory(
244
+ category: CodeOutcome['category']
245
+ ): CodeOutcome[] {
246
+ return Object.values(CODE_OUTCOMES).filter((outcome) => outcome.category === category);
247
+ }
248
+
249
+ /**
250
+ * Return every CODE_OUTCOMES entry whose `language` equals the given language; the array is empty when none match.
251
+ */
252
+ export function getCodeOutcomesByLanguage(
253
+ language: CodeOutcome['language']
254
+ ): CodeOutcome[] {
255
+ return Object.values(CODE_OUTCOMES).filter((outcome) => outcome.language === language);
256
+ }
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Code Review Battle Outcome Tests
3
+ *
4
+ * Verifies the code review battle outcome configuration.
5
+ *
6
+ * @module outcomes/code_review_battle.test
7
+ */
8
+
9
+ import { describe, it, expect } from 'vitest';
10
+ import { validateOutcome } from './outcome.schema.js';
11
+ import {
12
+ codeReviewBattle,
13
+ CODE_REVIEW_PAYOUT,
14
+ CODE_REVIEW_MAX_ATTEMPTS,
15
+ CODE_REVIEW_TIME_LIMIT_MS,
16
+ } from './code_review_battle.js';
17
+
18
+ describe('Code Review Battle Outcome', () => {
19
+ describe('Constants', () => {
20
+ it('should have payout amount of $50', () => {
21
+ expect(CODE_REVIEW_PAYOUT).toBe(50);
22
+ });
23
+
24
+ it('should have max attempts of 3', () => {
25
+ expect(CODE_REVIEW_MAX_ATTEMPTS).toBe(3);
26
+ });
27
+
28
+ it('should have time limit of 300000ms (5 minutes)', () => {
29
+ expect(CODE_REVIEW_TIME_LIMIT_MS).toBe(300000);
30
+ });
31
+ });
32
+
33
+ describe('Outcome Configuration', () => {
34
+ it('should have correct payout amount', () => {
35
+ expect(codeReviewBattle.payoutAmount).toBe(50);
36
+ });
37
+
38
+ it('should have correct max attempts', () => {
39
+ expect(codeReviewBattle.maxAttempts).toBe(3);
40
+ });
41
+
42
+ it('should have correct time limit', () => {
43
+ expect(codeReviewBattle.timeLimitMs).toBe(300000);
44
+ });
45
+
46
+ it('should have name "code_review_battle"', () => {
47
+ expect(codeReviewBattle.name).toBe('code_review_battle');
48
+ });
49
+
50
+ it('should have a description', () => {
51
+ expect(codeReviewBattle.description).toBeTruthy();
52
+ expect(typeof codeReviewBattle.description).toBe('string');
53
+ });
54
+ });
55
+
56
+ describe('Schema Validation', () => {
57
+ it('should validate against OutcomeSchema', () => {
58
+ const result = validateOutcome(codeReviewBattle);
59
+ expect(result.valid).toBe(true);
60
+ expect(result.errors).toHaveLength(0);
61
+ });
62
+ });
63
+
64
+ describe('Success Criteria', () => {
65
+ it('should have 4 success criteria', () => {
66
+ expect(codeReviewBattle.successCriteria).toHaveLength(4);
67
+ });
68
+
69
+ it('should include security_vulnerability criterion', () => {
70
+ const criterion = codeReviewBattle.successCriteria.find(
71
+ (c) => c.name === 'security_vulnerability'
72
+ );
73
+ expect(criterion).toBeDefined();
74
+ expect(criterion?.validator).toBe('validateSecurityIssue');
75
+ expect(criterion?.params).toEqual({ requiredSeverity: 'CRITICAL' });
76
+ });
77
+
78
+ it('should include performance_bottleneck criterion', () => {
79
+ const criterion = codeReviewBattle.successCriteria.find(
80
+ (c) => c.name === 'performance_bottleneck'
81
+ );
82
+ expect(criterion).toBeDefined();
83
+ expect(criterion?.validator).toBe('validatePerformanceIssue');
84
+ expect(criterion?.params).toEqual({});
85
+ });
86
+
87
+ it('should include zero_noise criterion', () => {
88
+ const criterion = codeReviewBattle.successCriteria.find(
89
+ (c) => c.name === 'zero_noise'
90
+ );
91
+ expect(criterion).toBeDefined();
92
+ expect(criterion?.validator).toBe('validateNoiseFreeness');
93
+ expect(criterion?.params).toEqual({});
94
+ });
95
+
96
+ it('should include complexity_reduction criterion', () => {
97
+ const criterion = codeReviewBattle.successCriteria.find(
98
+ (c) => c.name === 'complexity_reduction'
99
+ );
100
+ expect(criterion).toBeDefined();
101
+ expect(criterion?.validator).toBe('validateComplexityReduction');
102
+ expect(criterion?.params).toEqual({ minReduction: 2 });
103
+ });
104
+ });
105
+
106
+ describe('Failure Reasons', () => {
107
+ it('should have 4 failure reasons', () => {
108
+ expect(codeReviewBattle.failureReasons).toHaveLength(4);
109
+ });
110
+
111
+ it('should include security vulnerability failure reason', () => {
112
+ expect(codeReviewBattle.failureReasons).toContain(
113
+ 'No critical security vulnerability identified (SQLi/XSS)'
114
+ );
115
+ });
116
+
117
+ it('should include performance bottleneck failure reason', () => {
118
+ expect(codeReviewBattle.failureReasons).toContain(
119
+ 'No performance bottleneck identified (N+1 queries)'
120
+ );
121
+ });
122
+
123
+ it('should include noise failure reason', () => {
124
+ expect(codeReviewBattle.failureReasons).toContain(
125
+ 'Comments found outside source diff boundaries (noise)'
126
+ );
127
+ });
128
+
129
+ it('should include complexity reduction failure reason', () => {
130
+ expect(codeReviewBattle.failureReasons).toContain(
131
+ 'Refactor suggestion does not reduce complexity by at least 2 points'
132
+ );
133
+ });
134
+ });
135
+ });
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Code Review Battle Outcome
3
+ *
4
+ * Tournament Seed bounty for the WAI Championship Q1 Tournament.
5
+ * Agents compete to perform expert code review identifying security
6
+ * vulnerabilities, performance bottlenecks, and complexity-reducing refactors.
7
+ *
8
+ * @module outcomes/code_review_battle
9
+ * @see Requirements 1.1, 1.2, 1.3, 8.1
10
+ */
11
+
12
+ import type { Outcome } from './outcome.schema.js';
13
+
14
+ /**
15
+ * Payout amount for successfully completing a code review battle.
16
+ * Set to $50 as per Requirements 1.1.
17
+ */
18
+ export const CODE_REVIEW_PAYOUT = 50;
19
+
20
+ /**
21
+ * Maximum number of attempts an agent can make to achieve this outcome.
22
+ * Set to 3 as per Requirements 1.2.
23
+ */
24
+ export const CODE_REVIEW_MAX_ATTEMPTS = 3;
25
+
26
+ /**
27
+ * Time limit for achieving the outcome (5 minutes in milliseconds).
28
+ * Set to 300000ms as per Requirements 1.3.
29
+ */
30
+ export const CODE_REVIEW_TIME_LIMIT_MS = 300000;
31
+
32
+ /**
33
+ * Represents a code review issue identified by an agent.
34
+ */
35
+ export interface CodeReviewIssue {
36
+ /** Type of issue: security, performance, style, or logic */
37
+ type: 'security' | 'performance' | 'style' | 'logic';
38
+ /** Severity level of the issue */
39
+ severity: 'CRITICAL' | 'HIGH' | 'MEDIUM' | 'LOW';
40
+ /** Description of the issue */
41
+ description: string;
42
+ /** Optional line number where the issue was found */
43
+ lineNumber?: number;
44
+ }
45
+
46
+ /**
47
+ * Represents a code review comment on a specific line.
48
+ */
49
+ export interface CodeReviewComment {
50
+ /** The exact content of the line being commented on (must be in source diff) */
51
+ lineContent: string;
52
+ /** The review comment */
53
+ comment: string;
54
+ /** Line number in the source diff */
55
+ lineNumber: number;
56
+ }
57
+
58
+
59
+ /**
60
+ * Represents a refactoring suggestion that reduces cyclomatic complexity.
61
+ */
62
+ export interface RefactorSuggestion {
63
+ /** Original cyclomatic complexity of the code */
64
+ originalComplexity: number;
65
+ /** Suggested cyclomatic complexity after refactoring */
66
+ suggestedComplexity: number;
67
+ /** Description of the refactoring suggestion */
68
+ description: string;
69
+ }
70
+
71
+ /**
72
+ * Represents the complete artifact produced by an agent for code review.
73
+ */
74
+ export interface CodeReviewArtifact {
75
+ /** List of issues identified in the code */
76
+ issues: CodeReviewIssue[];
77
+ /** List of comments on specific lines */
78
+ comments: CodeReviewComment[];
79
+ /** Optional refactoring suggestion */
80
+ refactorSuggestion?: RefactorSuggestion;
81
+ }
82
+
83
+ /**
84
+ * Code Review Battle Outcome Definition
85
+ *
86
+ * Success requires meeting ALL 4 criteria:
87
+ * 1. Identify at least one CRITICAL security vulnerability (SQLi/XSS)
88
+ * 2. Identify at least one performance bottleneck (N+1 queries)
89
+ * 3. All comments must reside within source diff boundaries (Zero-Noise Standard)
90
+ * 4. Suggest a refactor that reduces cyclomatic complexity by at least 2 points
91
+ *
92
+ * @see Requirements 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 8.1
93
+ */
94
+ export const codeReviewBattle: Outcome = {
95
+ name: 'code_review_battle',
96
+ description:
97
+ 'Expert Code Review Challenge - Identify security vulnerabilities, performance bottlenecks, and suggest complexity-reducing refactors with zero noise.',
98
+ payoutAmount: CODE_REVIEW_PAYOUT,
99
+ maxAttempts: CODE_REVIEW_MAX_ATTEMPTS,
100
+ timeLimitMs: CODE_REVIEW_TIME_LIMIT_MS,
101
+ successCriteria: [
102
+ {
103
+ // Requirement 1.4: Verify at least one security vulnerability with CRITICAL severity
104
+ name: 'security_vulnerability',
105
+ validator: 'validateSecurityIssue',
106
+ params: { requiredSeverity: 'CRITICAL' },
107
+ },
108
+ {
109
+ // Requirement 1.5: Verify at least one performance bottleneck
110
+ name: 'performance_bottleneck',
111
+ validator: 'validatePerformanceIssue',
112
+ params: {},
113
+ },
114
+ {
115
+ // Requirement 1.6: Verify all comments reside within source diff boundaries
116
+ name: 'zero_noise',
117
+ validator: 'validateNoiseFreeness',
118
+ params: {},
119
+ },
120
+ {
121
+ // Requirement 1.7: Verify refactor suggestion reduces complexity by at least 2
122
+ name: 'complexity_reduction',
123
+ validator: 'validateComplexityReduction',
124
+ params: { minReduction: 2 },
125
+ },
126
+ ],
127
+ failureReasons: [
128
+ 'No critical security vulnerability identified (SQLi/XSS)',
129
+ 'No performance bottleneck identified (N+1 queries)',
130
+ 'Comments found outside source diff boundaries (noise)',
131
+ 'Refactor suggestion does not reduce complexity by at least 2 points',
132
+ ],
133
+ };
134
+
135
+ export default codeReviewBattle;
@@ -0,0 +1,97 @@
1
+ /**
2
+ * Cold Email Battle Outcome
3
+ *
4
+ * Viral content battle: Claude vs GPT-4 vs Gemini
5
+ * Agents compete to write the best cold email to a YC founder.
6
+ *
7
+ * @module outcomes/cold_email_battle
8
+ */
9
+
10
+ import type { Outcome } from './outcome.schema.js';
11
+
12
+ /**
13
+ * Target prospect for the cold email battle
14
+ */
15
+ export interface ColdEmailTarget {
16
+ name: string;
17
+ company: string;
18
+ role: string;
19
+ companyDescription: string;
20
+ recentNews?: string;
21
+ painPoints?: string[];
22
+ }
23
+
24
+ /**
25
+ * Cold email artifact produced by an agent
26
+ */
27
+ export interface ColdEmailArtifact {
28
+ subject: string;
29
+ body: string;
30
+ wordCount: number;
31
+ personalizationScore: number;
32
+ callToAction: string;
33
+ }
34
+
35
+ /**
36
+ * Default target for the viral battle
37
+ */
38
+ export const DEFAULT_TARGET: ColdEmailTarget = {
39
+ name: 'Sarah Chen',
40
+ company: 'TechFlow AI',
41
+ role: 'CEO & Co-founder',
42
+ companyDescription: 'YC W24 startup building AI-powered workflow automation for enterprise teams',
43
+ recentNews: 'Just raised $4M seed round, hiring aggressively',
44
+ painPoints: [
45
+ 'Scaling engineering team while maintaining quality',
46
+ 'Evaluating AI tools for internal use',
47
+ 'Building developer community around their product'
48
+ ]
49
+ };
50
+
51
+ /**
52
+ * Cold Email Battle Outcome Definition
53
+ *
54
+ * Success criteria:
55
+ * 1. Subject line under 50 characters
56
+ * 2. Body between 50-150 words
57
+ * 3. Contains personalization (requiredElements: company, role; recent news is not checked)
58
+ * 4. Has clear call-to-action
59
+ * 5. Professional tone (no spam triggers) - not enforced by any validator in successCriteria
60
+ */
61
+ export const coldEmailBattle: Outcome = {
62
+ name: 'cold_email_battle',
63
+ description: 'Cold Email Showdown - Write the most compelling cold email to a YC founder. Judged on personalization, clarity, and likelihood to get a response.',
64
+ payoutAmount: 0, // Sandbox mode
65
+ maxAttempts: 1,
66
+ timeLimitMs: 60000,
67
+ successCriteria: [
68
+ {
69
+ name: 'subject_length',
70
+ validator: 'validateSubjectLength',
71
+ params: { maxLength: 50 }
72
+ },
73
+ {
74
+ name: 'body_word_count',
75
+ validator: 'validateWordCount',
76
+ params: { min: 50, max: 150 }
77
+ },
78
+ {
79
+ name: 'has_personalization',
80
+ validator: 'validatePersonalization',
81
+ params: { requiredElements: ['company', 'role'] }
82
+ },
83
+ {
84
+ name: 'has_cta',
85
+ validator: 'validateCallToAction',
86
+ params: {}
87
+ }
88
+ ],
89
+ failureReasons: [
90
+ 'Subject line too long (max 50 characters)',
91
+ 'Body too short or too long (50-150 words required)',
92
+ 'Missing personalization - must mention company or role',
93
+ 'No clear call-to-action'
94
+ ]
95
+ };
96
+
97
+ export default coldEmailBattle;