outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,414 @@
1
+ /**
2
+ * Code Validators - SWE-bench style validation for code generation tasks
3
+ *
4
+ * Integrates software engineering evaluation into our existing validator system.
5
+ * These validators enable battles between code-focused agents (like Codeon).
6
+ *
7
+ * @module eval/code-validators
8
+ */
9
+
10
+ import type { ValidationResult } from './validators.js';
11
+
12
/**
 * Payload describing a piece of generated code that is submitted for
 * SWE-bench style evaluation.
 */
export interface CodeArtifactContent {
  /** Generated source code or patch text. */
  code: string;
  /** Programming language of `code` (python, typescript, etc.). */
  language: string;
  /** Issue/problem statement that the code was generated for. */
  issueDescription: string;
  /** Paths of the files that were modified (optional). */
  modifiedFiles?: Array<string>;
  /** Free-form explanation of the fix (optional). */
  explanation?: string;
}
28
+
29
/**
 * A single named test case used when evaluating generated code.
 */
export interface TestCase {
  /** Human-readable name or description of the test. */
  name: string;
  /** Value handed to the function/code under test. */
  input: unknown;
  /** Value the code is expected to produce for `input`. */
  expectedOutput: unknown;
  /** Category of the test (optional). */
  type?: 'edge-case' | 'integration' | 'unit';
}
42
+
43
/**
 * Checks that every required syntax element occurs somewhere in the code.
 *
 * Matching is a plain, case-sensitive substring search; the code is not parsed.
 *
 * @param code - The generated code to inspect
 * @param requiredElements - Substrings that must all be present in `code`
 * @returns `{ valid: true, errors: [] }` when nothing is missing; otherwise
 *   `valid: false` with a single error listing the missing elements
 *
 * @example
 * validateCodeSyntax("def foo(): return 42", ["def", "return"])
 * // { valid: true, errors: [] }
 */
export function validateCodeSyntax(
  code: string,
  requiredElements: string[]
): ValidationResult {
  const absent: string[] = [];

  requiredElements.forEach((token) => {
    if (code.includes(token)) {
      return;
    }
    absent.push(token);
  });

  if (absent.length > 0) {
    const message = `Missing required syntax elements: ${absent.join(', ')}`;
    return { valid: false, errors: [message] };
  }

  return { valid: true, errors: [] };
}
73
+
74
/**
 * Validates that code follows basic structure requirements.
 * Checks for function/class definitions, imports and line-count bounds using
 * per-language regular expressions (no parsing is performed).
 *
 * The language name is matched case-insensitively against `python`,
 * `typescript` and `javascript`; any other value falls back to the
 * JavaScript patterns.
 *
 * @param code - The generated code
 * @param language - Programming language
 * @param requirements - Structure requirements to check. `minLines` and
 *   `maxLines` count non-blank lines; a value of 0 (or omitted) disables
 *   the corresponding check.
 * @returns ValidationResult with one error message per unmet requirement
 */
export function validateCodeStructure(
  code: string,
  language: string,
  requirements: {
    mustHaveFunction?: boolean;
    mustHaveClass?: boolean;
    mustHaveImports?: boolean;
    minLines?: number;
    maxLines?: number;
  }
): ValidationResult {
  type LanguagePatterns = { function: RegExp; class: RegExp; import: RegExp };

  const errors: string[] = [];
  const lines = code.split('\n').filter((line) => line.trim().length > 0);

  const javascriptPatterns: LanguagePatterns = {
    function: /(function\s+\w+|const\s+\w+\s*=\s*(async\s*)?\(|=>\s*{)/,
    class: /class\s+\w+/,
    // `import` must be a whole word at the start of a line (so identifiers
    // such as `importantValue` do not count); `require(` may appear anywhere
    // on a line, which covers `const x = require('y')`.
    import: /^import\b|\brequire\s*\(/m,
  };

  // A Map is used instead of a plain object so that names such as
  // "constructor" or "toString" cannot resolve to inherited properties.
  const patterns = new Map<string, LanguagePatterns>([
    [
      'python',
      {
        function: /def\s+\w+\s*\(/,
        class: /class\s+\w+/,
        import: /^(import|from)\s+/m,
      },
    ],
    [
      'typescript',
      {
        function: /(function\s+\w+|const\s+\w+\s*=\s*(async\s*)?\(|=>\s*{)/,
        class: /class\s+\w+/,
        import: /^import\s+/m,
      },
    ],
    ['javascript', javascriptPatterns],
  ]);

  // String() keeps the fallback working even if an untyped caller passes a
  // non-string language value.
  const langPatterns =
    patterns.get(String(language).toLowerCase()) || javascriptPatterns;

  if (requirements.mustHaveFunction && !langPatterns.function.test(code)) {
    errors.push(`Code must contain a function definition`);
  }

  if (requirements.mustHaveClass && !langPatterns.class.test(code)) {
    errors.push(`Code must contain a class definition`);
  }

  if (requirements.mustHaveImports && !langPatterns.import.test(code)) {
    errors.push(`Code must contain import statements`);
  }

  if (requirements.minLines && lines.length < requirements.minLines) {
    errors.push(`Code too short: ${lines.length} lines, minimum ${requirements.minLines}`);
  }

  if (requirements.maxLines && lines.length > requirements.maxLines) {
    errors.push(`Code too long: ${lines.length} lines, maximum ${requirements.maxLines}`);
  }

  return {
    valid: errors.length === 0,
    errors,
  };
}
143
+
144
/**
 * Heuristically validates code against a list of test cases.
 *
 * NOTE: No code is executed. The only check performed is whether `code`
 * contains a return/output statement (`return`, `yield`, `console.log` or
 * `print`). That check does not look at the individual test inputs or
 * expected outputs, so either every test case is counted as passed or none
 * is. A real SWE-bench style evaluation would run the code in a sandbox.
 *
 * An empty `testCases` list is a vacuous pass: there is nothing to fail.
 *
 * @param code - The generated code
 * @param testCases - Array of test cases to validate against
 * @param options - Evaluation options
 * @param options.requireAllPass - Fail unless every test passes (default true)
 * @param options.minPassRate - Minimum fraction of passing tests, 0..1 (default 1.0)
 * @returns ValidationResult with test pass/fail details
 */
export function validateTestCases(
  code: string,
  testCases: TestCase[],
  options: {
    requireAllPass?: boolean;
    minPassRate?: number;
  } = {}
): ValidationResult {
  const { requireAllPass = true, minPassRate = 1.0 } = options;
  const total = testCases.length;

  // Nothing to check: avoid reporting a misleading "0.0% pass rate" failure.
  if (total === 0) {
    return { valid: true, errors: [] };
  }

  // The heuristic depends only on `code`, so evaluate it once rather than
  // once per test case.
  const outputMarkers = ['return', 'yield', 'console.log', 'print'];
  const hasRelevantLogic = outputMarkers.some((marker) => code.includes(marker));

  const passedCount = hasRelevantLogic ? total : 0;
  const errors: string[] = hasRelevantLogic
    ? []
    : testCases.map(
        (testCase) => `Test "${testCase.name}" may fail - no return/output statement found`
      );

  const passRate = passedCount / total;

  if (requireAllPass && passedCount < total) {
    return {
      valid: false,
      errors: [`Only ${passedCount}/${total} tests passed. ${errors.join('; ')}`],
    };
  }

  if (passRate < minPassRate) {
    return {
      valid: false,
      errors: [`Pass rate ${(passRate * 100).toFixed(1)}% below minimum ${(minPassRate * 100).toFixed(1)}%`],
    };
  }

  return { valid: true, errors: [] };
}
204
+
205
/**
 * Checks whether a patch/diff plausibly addresses an issue.
 *
 * Three textual checks are applied:
 * - keyword coverage: how many of `issueKeywords` occur in the patch
 *   (case-insensitive substring match);
 * - additions: some line starts with a single `+` (not `++`);
 * - deletions: some line starts with a single `-` (not `--`).
 *
 * @param patch - The generated patch/diff
 * @param issueKeywords - Keywords from the issue that should be addressed
 * @param options - Validation options
 * @param options.minKeywordsCovered - Minimum number of keywords that must appear (default 1)
 * @param options.mustHaveAdditions - Require at least one added line (default true)
 * @param options.mustHaveDeletions - Require at least one deleted line (default false)
 * @returns ValidationResult with one error per failed check
 */
export function validatePatchRelevance(
  patch: string,
  issueKeywords: string[],
  options: {
    minKeywordsCovered?: number;
    mustHaveAdditions?: boolean;
    mustHaveDeletions?: boolean;
  } = {}
): ValidationResult {
  const requiredHits =
    options.minKeywordsCovered === undefined ? 1 : options.minKeywordsCovered;
  const needAdditions =
    options.mustHaveAdditions === undefined ? true : options.mustHaveAdditions;
  const needDeletions =
    options.mustHaveDeletions === undefined ? false : options.mustHaveDeletions;

  const problems: string[] = [];

  // Keyword coverage, compared in lower case on both sides.
  const haystack = patch.toLowerCase();
  let hits = 0;
  issueKeywords.forEach((keyword) => {
    if (haystack.includes(keyword.toLowerCase())) {
      hits += 1;
    }
  });
  if (hits < requiredHits) {
    problems.push(`Patch only addresses ${hits}/${requiredHits} required keywords`);
  }

  // Added lines: a leading "+" that is not followed by another "+".
  if (needAdditions && !/^\+[^+]/m.test(patch)) {
    problems.push('Patch must contain code additions');
  }

  // Deleted lines: a leading "-" that is not followed by another "-".
  if (needDeletions && !/^-[^-]/m.test(patch)) {
    problems.push('Patch must contain code deletions');
  }

  return { valid: problems.length === 0, errors: problems };
}
259
+
260
/**
 * Validates simple, text-based code quality heuristics.
 *
 * All checks are pattern matches on the raw text; the code is not parsed:
 * - error handling: one of the keywords try/catch/except/throw/raise appears
 *   as a whole word (so identifiers like `country` or `retry` do not count);
 * - comments: the text contains `//`, `/*`, `#`, `"""` or `'''`;
 * - debug output: the text contains `console.log` or a call to `print(`
 *   (whole word, so `fingerprint(` does not count);
 * - complexity: number of whole-word control-flow keywords
 *   (if/else/for/while/switch/case/try/catch).
 *
 * @param code - The generated code
 * @param requirements - Quality requirements. A `maxComplexity` of 0 (or
 *   omitted) disables the complexity check.
 * @returns ValidationResult with one error per unmet requirement
 */
export function validateCodeQuality(
  code: string,
  requirements: {
    mustHaveErrorHandling?: boolean;
    mustHaveComments?: boolean;
    maxComplexity?: number;
    noConsoleLog?: boolean;
  } = {}
): ValidationResult {
  const errors: string[] = [];

  // Error handling check. Word boundaries prevent matches inside longer
  // identifiers such as "entry" or "country".
  if (requirements.mustHaveErrorHandling) {
    const hasErrorHandling = /\b(?:try|catch|except|throw|raise)\b/.test(code);

    if (!hasErrorHandling) {
      errors.push('Code should include error handling (try/catch/except)');
    }
  }

  // Comments check
  if (requirements.mustHaveComments) {
    const hasComments =
      code.includes('//') ||
      code.includes('/*') ||
      code.includes('#') ||
      code.includes('"""') ||
      code.includes("'''");

    if (!hasComments) {
      errors.push('Code should include comments or documentation');
    }
  }

  // Console.log check (often unwanted in production code). `print(` must
  // start at a word boundary so names like "fingerprint(" are not flagged.
  if (requirements.noConsoleLog) {
    if (code.includes('console.log') || /\bprint\(/.test(code)) {
      errors.push('Code should not contain debug print statements');
    }
  }

  // Complexity check (simplified: count control flow statements)
  if (requirements.maxComplexity) {
    const controlFlowPatterns = /\b(if|else|for|while|switch|case|try|catch)\b/g;
    const matches = code.match(controlFlowPatterns) || [];
    if (matches.length > requirements.maxComplexity) {
      errors.push(
        `Code complexity ${matches.length} exceeds maximum ${requirements.maxComplexity}`
      );
    }
  }

  return {
    valid: errors.length === 0,
    errors,
  };
}
330
+
331
/**
 * Combined SWE-bench style evaluation.
 * Runs the syntax, structure, test-case, relevance and quality validators on
 * one artifact and combines their pass/fail results into a weighted score.
 *
 * Validators whose configuration is omitted (or, for tests, empty) count as
 * passed. The relevance check always runs and requires
 * `min(2, issueKeywords.length)` keywords to appear in the code. Added `+`
 * lines are only required when the artifact looks like a unified diff;
 * plain source code is not expected to contain them.
 *
 * Weights: syntax 15%, structure 15%, tests 35%, relevance 25%, quality 10%.
 *
 * @param artifact - The code artifact to evaluate
 * @param config - Evaluation configuration
 * @returns Combined ValidationResult plus `score` (0..1, exactly 1 when every
 *   check passes) and a per-check `breakdown`
 */
export function evaluateSWEBenchStyle(
  artifact: CodeArtifactContent,
  config: {
    issueKeywords: string[];
    requiredSyntax?: string[];
    testCases?: TestCase[];
    structureRequirements?: Parameters<typeof validateCodeStructure>[2];
    qualityRequirements?: Parameters<typeof validateCodeQuality>[1];
  }
): ValidationResult & { score: number; breakdown: Record<string, boolean> } {
  const breakdown: Record<string, boolean> = {};
  const allErrors: string[] = [];

  // 1. Syntax validation
  if (config.requiredSyntax) {
    const syntaxResult = validateCodeSyntax(artifact.code, config.requiredSyntax);
    breakdown.syntax = syntaxResult.valid;
    if (!syntaxResult.valid) allErrors.push(...syntaxResult.errors);
  } else {
    breakdown.syntax = true;
  }

  // 2. Structure validation
  if (config.structureRequirements) {
    const structureResult = validateCodeStructure(
      artifact.code,
      artifact.language,
      config.structureRequirements
    );
    breakdown.structure = structureResult.valid;
    if (!structureResult.valid) allErrors.push(...structureResult.errors);
  } else {
    breakdown.structure = true;
  }

  // 3. Test case validation
  if (config.testCases && config.testCases.length > 0) {
    const testResult = validateTestCases(artifact.code, config.testCases, {
      requireAllPass: false,
      minPassRate: 0.7,
    });
    breakdown.tests = testResult.valid;
    if (!testResult.valid) allErrors.push(...testResult.errors);
  } else {
    breakdown.tests = true;
  }

  // 4. Relevance validation (does it address the issue?)
  // Only demand "+" lines when the artifact is actually a diff (it has a
  // git/unified-diff header or hunk marker); otherwise plain code could
  // never pass this check.
  const looksLikeDiff = /^(?:diff --git |\+\+\+ |@@ -\d)/m.test(artifact.code);
  const relevanceResult = validatePatchRelevance(artifact.code, config.issueKeywords, {
    minKeywordsCovered: Math.min(2, config.issueKeywords.length),
    mustHaveAdditions: looksLikeDiff,
  });
  breakdown.relevance = relevanceResult.valid;
  if (!relevanceResult.valid) allErrors.push(...relevanceResult.errors);

  // 5. Quality validation
  if (config.qualityRequirements) {
    const qualityResult = validateCodeQuality(artifact.code, config.qualityRequirements);
    breakdown.quality = qualityResult.valid;
    if (!qualityResult.valid) allErrors.push(...qualityResult.errors);
  } else {
    breakdown.quality = true;
  }

  // Calculate score (weighted sum). Weights are kept as integer percentage
  // points and divided once at the end: summing 0.15 + 0.15 + 0.35 + 0.25 +
  // 0.10 in floating point yields 0.9999999999999999 rather than 1.
  const weightPoints = { syntax: 15, structure: 15, tests: 35, relevance: 25, quality: 10 };
  const points = Object.entries(breakdown).reduce((acc, [key, passed]) => {
    return acc + (passed ? weightPoints[key as keyof typeof weightPoints] || 0 : 0);
  }, 0);
  const score = points / 100;

  return {
    valid: allErrors.length === 0,
    errors: allErrors,
    score,
    breakdown,
  };
}