outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,368 @@
1
+ /**
2
+ * AI-Powered Evaluation System
3
+ *
4
+ * Implements AI judges for subjective bounty criteria evaluation.
5
+ * Supports GPT-4o and Claude Opus models for fair evaluation of creative tasks.
6
+ *
7
+ * @module eval/ai-judge
8
+ * @see Requirements 10.1, 10.2, 10.3, 10.4, 10.5
9
+ */
10
+
11
+ import Anthropic from '@anthropic-ai/sdk';
12
+ import OpenAI from 'openai';
13
+ import { getJudgeCache, type JudgeCache, type JudgeResult, type JudgeModel } from './judge-cache.js';
14
+ import { createHash } from 'crypto';
15
+
16
+ // Re-export types from judge-cache for convenience
17
+ export type { JudgeModel, JudgeResult } from './judge-cache.js';
18
+
/**
 * Settings that control a single AI judge evaluation.
 *
 * @see Requirements 10.1, 10.5
 */
export interface JudgeConfig {
  /** Judge model that performs the evaluation. */
  model: JudgeModel;
  /** Rubric text the artifact is scored against. */
  rubric: string;
  /** Upper bound of the score scale; parsed scores are clamped to it. */
  maxScore: number;
  /** Optional sampling temperature (the GPT-4o evaluator falls back to 0.3 when omitted). */
  temperature?: number;
  /** Optional cap on response tokens (the evaluators fall back to 1024 when omitted). */
  maxTokens?: number;
}
36
+
/**
 * Shape of a judge reply after it has been parsed and validated.
 * Internal to this module.
 */
interface ParsedJudgeResponse {
  /** Numeric score awarded by the judge. */
  score: number;
  /** The judge's explanation for the score. */
  reasoning: string;
  /** Notable points called out by the judge (may be empty). */
  highlights: string[];
}
45
+
/**
 * Error raised when an AI judge evaluation cannot be completed.
 *
 * Carries the judge model involved and, when supplied, the underlying error.
 */
export class AIJudgeError extends Error {
  /** Judge model associated with the failure. */
  public readonly model: JudgeModel;
  /** Underlying error that led to this failure, if one was provided. */
  public readonly cause?: Error;

  /**
   * @param message - Human-readable description of the failure
   * @param model - Judge model associated with the failure
   * @param cause - Optional underlying error
   */
  constructor(message: string, model: JudgeModel, cause?: Error) {
    super(message);
    this.model = model;
    this.cause = cause;
    this.name = 'AIJudgeError';
  }
}
59
+
/**
 * Derives the cache key for a judge evaluation.
 *
 * The artifact and rubric are serialized together as JSON and digested with
 * SHA-256, so the same artifact/rubric pair always maps to the same key.
 *
 * @param artifact - The artifact being evaluated
 * @param rubric - The evaluation rubric
 * @returns Hex-encoded SHA-256 digest used as the cache key
 *
 * @see Requirements 10.4
 */
export function hashArtifact(artifact: unknown, rubric: string): string {
  const hasher = createHash('sha256');
  hasher.update(JSON.stringify({ artifact, rubric }));
  return hasher.digest('hex');
}
74
+
/**
 * Assembles the prompt sent to the AI judge.
 *
 * String artifacts are embedded verbatim; any other value is pretty-printed
 * as JSON with two-space indentation before being embedded.
 *
 * @param artifact - The artifact to evaluate
 * @param rubric - The evaluation rubric
 * @param maxScore - Maximum score possible
 * @returns The complete prompt text
 *
 * @see Requirements 10.2
 */
function buildJudgePrompt(artifact: unknown, rubric: string, maxScore: number): string {
  let renderedArtifact: string;
  if (typeof artifact === 'string') {
    renderedArtifact = artifact;
  } else {
    renderedArtifact = JSON.stringify(artifact, null, 2);
  }

  const promptLines = [
    "You are an expert AI judge evaluating an agent's output against a specific rubric.",
    '',
    '## Evaluation Rubric',
    `${rubric}`,
    '',
    '## Agent Output to Evaluate',
    `${renderedArtifact}`,
    '',
    '## Instructions',
    "1. Carefully evaluate the agent's output against each criterion in the rubric",
    `2. Provide a score from 0 to ${maxScore} based on how well the output meets the criteria`,
    '3. Explain your reasoning in detail',
    '4. Highlight any notable strengths or weaknesses',
    '',
    '## Response Format',
    'Respond with a JSON object in this exact format:',
    '{',
    `"score": <number from 0 to ${maxScore}>,`,
    '"reasoning": "<detailed explanation of your evaluation>",',
    '"highlights": ["<notable aspect 1>", "<notable aspect 2>", ...]',
    '}',
    '',
    'Respond ONLY with the JSON object, no additional text.',
  ];

  return promptLines.join('\n');
}
114
+
/**
 * Returns the balanced `{ ... }` span of `text` that begins at `start`, or
 * `undefined` when the braces never balance. Braces inside double-quoted
 * strings (including escaped quotes) are ignored.
 */
function sliceBalancedObject(text: string, start: number): string | undefined {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) {
        return text.slice(start, i + 1);
      }
    }
  }
  return undefined;
}

/**
 * Locates and parses the JSON object embedded in a model response.
 *
 * The span from the first `{` to the last `}` is tried first. If that span is
 * not valid JSON (for example because prose containing braces follows the
 * object), each `{` is tried in turn as the start of a balanced object.
 *
 * @throws Error when the text contains no `{...}` span, or the original
 *         JSON parse error when no candidate parses to an object
 */
function extractJsonObject(text: string): Record<string, unknown> {
  const greedy = text.match(/\{[\s\S]*\}/);
  if (!greedy) {
    throw new Error('No JSON object found in response');
  }

  let firstError: unknown;
  try {
    return JSON.parse(greedy[0]) as Record<string, unknown>;
  } catch (error) {
    // Remember the original failure so it is reported if the fallback also fails.
    firstError = error;
  }

  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    const candidate = sliceBalancedObject(text, start);
    if (candidate === undefined) {
      continue;
    }
    try {
      const value: unknown = JSON.parse(candidate);
      if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
        return value as Record<string, unknown>;
      }
    } catch {
      // Not valid JSON from this position; try the next opening brace.
    }
  }

  throw firstError;
}

/**
 * Parses the AI judge response into a structured format.
 *
 * The response must contain a JSON object with a numeric `score` and a
 * non-empty string `reasoning`. `highlights` is optional; non-string entries
 * are dropped and a missing or non-array value becomes an empty array.
 *
 * @param response - Raw response from the AI model
 * @param maxScore - Maximum score; the parsed score is clamped to [0, maxScore]
 * @param model - Judge model that produced the response, attached to any error
 * @returns Parsed judge response
 * @throws AIJudgeError if no JSON object can be parsed or a required field is
 *         missing or invalid; the underlying error is attached as `cause`
 */
function parseJudgeResponse(response: string, maxScore: number, model: JudgeModel): ParsedJudgeResponse {
  try {
    const parsed = extractJsonObject(response);

    // Validate required fields
    if (typeof parsed.score !== 'number') {
      throw new Error('Missing or invalid "score" field');
    }
    if (typeof parsed.reasoning !== 'string' || parsed.reasoning.trim() === '') {
      throw new Error('Missing or invalid "reasoning" field');
    }

    // Clamp score to valid range
    const score = Math.max(0, Math.min(maxScore, parsed.score));

    // Parse highlights (optional, default to empty array)
    const highlights = Array.isArray(parsed.highlights)
      ? parsed.highlights.filter((h): h is string => typeof h === 'string')
      : [];

    return {
      score,
      reasoning: parsed.reasoning,
      highlights,
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : 'Unknown parsing error';
    // Keep the underlying error as the cause, matching the evaluator functions.
    const cause = error instanceof Error ? error : undefined;
    throw new AIJudgeError(`Failed to parse judge response: ${errorMessage}`, model, cause);
  }
}
159
+
/**
 * Scores an artifact by asking GPT-4o to act as the judge.
 *
 * Reads the API key from `process.env.OPENAI_API_KEY`, sends the judge prompt
 * as a chat completion and parses the first choice's message content.
 *
 * @param artifact - The artifact to evaluate
 * @param config - Judge configuration
 * @returns Parsed judge response
 * @throws AIJudgeError when the API key is missing, the reply is empty, the
 *         reply cannot be parsed, or the request fails (original error
 *         attached as `cause`)
 */
async function evaluateWithGPT4o(
  artifact: unknown,
  config: JudgeConfig
): Promise<ParsedJudgeResponse> {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new AIJudgeError('OPENAI_API_KEY not configured', 'gpt-4o');
  }

  const openai = new OpenAI({ apiKey });
  const userPrompt = buildJudgePrompt(artifact, config.rubric, config.maxScore);
  const systemPrompt = 'You are an expert AI judge. Evaluate outputs fairly and provide detailed reasoning.';

  try {
    const completion = await openai.chat.completions.create({
      model: 'gpt-4o',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt },
      ],
      temperature: config.temperature ?? 0.3,
      max_tokens: config.maxTokens ?? 1024,
    });

    const replyText = completion.choices[0]?.message?.content;
    if (replyText) {
      return parseJudgeResponse(replyText, config.maxScore, 'gpt-4o');
    }
    throw new AIJudgeError('Empty response from GPT-4o', 'gpt-4o');
  } catch (err) {
    // Errors already raised as AIJudgeError pass through untouched.
    if (!(err instanceof AIJudgeError)) {
      const cause = err instanceof Error ? err : new Error(String(err));
      throw new AIJudgeError(`GPT-4o evaluation failed: ${cause.message}`, 'gpt-4o', cause);
    }
    throw err;
  }
}
210
+
/**
 * Evaluates an artifact using Claude Opus as the judge.
 *
 * Reads the API key from `process.env.ANTHROPIC_API_KEY`, sends the judge
 * prompt through the Messages API and parses the first content block.
 *
 * @param artifact - The artifact to evaluate
 * @param config - Judge configuration
 * @returns Parsed judge response
 * @throws AIJudgeError when the API key is missing, the reply is empty or not
 *         text, the reply cannot be parsed, or the request fails (original
 *         error attached as `cause`)
 */
async function evaluateWithClaudeOpus(
  artifact: unknown,
  config: JudgeConfig
): Promise<ParsedJudgeResponse> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new AIJudgeError('ANTHROPIC_API_KEY not configured', 'claude-opus');
  }

  const client = new Anthropic({ apiKey });
  const prompt = buildJudgePrompt(artifact, config.rubric, config.maxScore);

  try {
    const response = await client.messages.create({
      model: 'claude-3-opus-20240229',
      max_tokens: config.maxTokens ?? 1024,
      // Honor the configured temperature with the same 0.3 default as the
      // GPT-4o judge. The Messages API accepts 0.0-1.0 while JudgeConfig
      // validation allows up to 2, so cap at 1 rather than send a value the
      // API would reject.
      temperature: Math.min(config.temperature ?? 0.3, 1),
      system: 'You are an expert AI judge. Evaluate outputs fairly and provide detailed reasoning.',
      messages: [
        {
          role: 'user',
          content: prompt,
        },
      ],
    });

    // Guard against an empty content array before inspecting the first block.
    const content = response.content[0];
    if (!content || content.type !== 'text' || !content.text) {
      throw new AIJudgeError('Empty response from Claude Opus', 'claude-opus');
    }

    return parseJudgeResponse(content.text, config.maxScore, 'claude-opus');
  } catch (error) {
    if (error instanceof AIJudgeError) {
      throw error;
    }
    const cause = error instanceof Error ? error : new Error(String(error));
    throw new AIJudgeError(`Claude Opus evaluation failed: ${cause.message}`, 'claude-opus', cause);
  }
}
257
+
/**
 * Scores an artifact with the configured AI judge, consulting the cache first.
 *
 * Steps:
 * 1. Look up a previous result under the artifact + rubric hash
 * 2. On a miss, call the evaluator for `config.model`
 * 3. Build the result from the parsed response
 * 4. Store the result (minus the `cached` flag) for later calls
 *
 * NOTE(review): the cache key is derived from the artifact and rubric only,
 * so a hit is returned even when `config.model` or `config.maxScore` differs
 * from the call that produced the stored result. Confirm this is intended.
 *
 * @param artifact - The artifact to evaluate
 * @param config - Judge configuration including model, rubric, and maxScore
 * @param cache - Optional cache instance (defaults to `getJudgeCache()`)
 * @returns JudgeResult with score, reasoning, and highlights; `cached` is
 *          true when the result came from the cache
 *
 * @example
 * const result = await evaluateWithAIJudge(
 *   { message: "Hello world", quality: "high" },
 *   {
 *     model: 'gpt-4o',
 *     rubric: 'Evaluate the message for clarity and professionalism...',
 *     maxScore: 10
 *   }
 * );
 *
 * @see Requirements 10.1, 10.2, 10.3, 10.4, 10.5
 */
export async function evaluateWithAIJudge(
  artifact: unknown,
  config: JudgeConfig,
  cache?: JudgeCache
): Promise<JudgeResult> {
  const store = cache ?? getJudgeCache();
  const key = hashArtifact(artifact, config.rubric);

  // Serve a previously stored result when one exists (idempotence).
  const hit = await store.get(key);
  if (hit) {
    return { ...hit, cached: true };
  }

  // Dispatch to the evaluator for the requested model.
  let verdict: ParsedJudgeResponse;
  if (config.model === 'gpt-4o') {
    verdict = await evaluateWithGPT4o(artifact, config);
  } else if (config.model === 'claude-opus') {
    verdict = await evaluateWithClaudeOpus(artifact, config);
  } else {
    const exhaustiveCheck: never = config.model;
    throw new AIJudgeError(`Unsupported judge model: ${exhaustiveCheck}`, config.model);
  }

  const { score, reasoning, highlights } = verdict;
  const result: JudgeResult = {
    score,
    normalizedScore: config.maxScore > 0 ? score / config.maxScore : 0,
    reasoning,
    highlights,
    model: config.model,
    cached: false,
    evaluatedAt: new Date().toISOString(),
  };

  // Persist everything except the per-call `cached` flag.
  const { cached: _cachedFlag, ...persisted } = result;
  await store.set(key, persisted as JudgeResult);

  return result;
}
338
+
/**
 * Validates a JudgeConfig object.
 *
 * Checks performed:
 * - `model` is one of the supported judge models
 * - `rubric` is a non-empty string
 * - `maxScore` is a finite number greater than 0
 * - `temperature`, when given, is a number in [0, 2] (NaN is rejected)
 * - `maxTokens`, when given, is an integer in [1, 4096] (NaN is rejected)
 *
 * @param config - Configuration to validate
 * @returns true if valid
 * @throws Error describing the first invalid field
 */
export function validateJudgeConfig(config: JudgeConfig): boolean {
  if (!['gpt-4o', 'claude-opus'].includes(config.model)) {
    throw new Error(`Invalid judge model: ${config.model}`);
  }

  if (typeof config.rubric !== 'string' || config.rubric.trim() === '') {
    throw new Error('Rubric must be a non-empty string');
  }

  // Number.isFinite rejects NaN and +/-Infinity, which `<= 0` alone lets through.
  if (typeof config.maxScore !== 'number' || !Number.isFinite(config.maxScore) || config.maxScore <= 0) {
    throw new Error('maxScore must be a positive number');
  }

  const { temperature, maxTokens } = config;

  // The range checks are written as negated "in range" tests so NaN fails them.
  if (temperature !== undefined && (typeof temperature !== 'number' || !(temperature >= 0 && temperature <= 2))) {
    throw new Error('temperature must be between 0 and 2');
  }

  if (maxTokens !== undefined) {
    if (typeof maxTokens !== 'number' || !(maxTokens >= 1 && maxTokens <= 4096)) {
      throw new Error('maxTokens must be between 1 and 4096');
    }
    // Token limits are counts; a fractional value is not meaningful.
    if (!Number.isInteger(maxTokens)) {
      throw new Error('maxTokens must be an integer');
    }
  }

  return true;
}