outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,318 @@
1
+ # Evaluation System
2
+
3
+ Evaluation of agent artifacts against outcome success criteria, supporting both binary and weighted scoring modes.
4
+
5
+ ## Overview
6
+
7
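+ The evaluation system determines whether an agent has successfully achieved an outcome by running all success criteria validators against the agent's artifact. Validator-based evaluation is **deterministic** - the same inputs always produce the same result. The optional AI Judge (see "AI-Powered Evaluation (AI Judge)" below) handles subjective criteria that cannot be evaluated deterministically; its results are cached so that re-evaluating the same artifact and rubric returns the same result.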
8
+
9
+ ## Evaluation Modes
10
+
11
+ ### Binary Evaluation (Default)
12
+ Traditional pass/fail evaluation where all criteria must pass for success. Used for bounty payouts.
13
+
14
+ ### Weighted Scoring
15
+ Granular scoring with weighted criteria for partial success. Used for leaderboard rankings and agent comparison. Agents earn credit according to the weight of each criterion, even when they do not achieve full success.
16
+
17
+ ## Key Principles
18
+
19
+ 1. **Deterministic** - Same inputs always produce identical outputs
20
+ 2. **Fail-Closed** - When in doubt, return FAILURE with no payout
21
+ 3. **Observable** - All validation results are logged with structured reasons
22
+ 4. **Granular Scoring** - Weighted evaluation rewards partial success for rankings
23
+
24
+ ## Validators
25
+
26
+ Pure validation functions that check individual success criteria:
27
+
28
+ | Validator | Purpose | Requirements |
29
+ |-----------|---------|--------------|
30
+ | `validateBuyingIntent` | Checks message contains buying intent keywords | 8.1 |
31
+ | `validateCompanySize` | Verifies company has minimum employee count | 8.2 |
32
+ | `validateRole` | Ensures role is not in excluded list | 8.3 |
33
+ | `validateMessageLength` | Validates message meets minimum word count | 8.4 |
34
+ | `validateEmail` | Checks email syntax is valid | 8.5 |
35
+
36
+ ## Usage
37
+
38
+ ```typescript
39
+ import {
40
+ validateBuyingIntent,
41
+ validateCompanySize,
42
+ validateRole,
43
+ validateMessageLength,
44
+ validateEmail,
45
+ } from './validators.js';
46
+
47
+ // Each validator returns { valid: boolean, errors: string[] }
48
+ const intentResult = validateBuyingIntent(
49
+ "I'd like to schedule a demo",
50
+ ['pricing', 'demo', 'next steps']
51
+ );
52
+ // { valid: true, errors: [] }
53
+
54
+ const sizeResult = validateCompanySize(25, 50);
55
+ // { valid: false, errors: ['Company too small - must have at least 50 employees, got 25'] }
56
+ ```
57
+
58
+ ## ValidationResult Interface
59
+
60
+ ```typescript
61
+ interface ValidationResult {
62
+ /** Whether validation passed */
63
+ valid: boolean;
64
+ /** Error messages if validation failed */
65
+ errors: string[];
66
+ }
67
+ ```
68
+
69
+ ## Evaluation Orchestration
70
+
71
+ The `evaluateOutcome` function orchestrates binary evaluation of agent artifacts:
72
+
73
+ ```typescript
74
+ import { evaluateOutcome } from './evaluateOutcome.js';
75
+ import { qualifiedSalesInterest } from '../outcomes/qualified_sales_interest.js';
76
+
77
+ const artifact = {
78
+ agentId: 'agent-001',
79
+ outcomeId: 'qualified_sales_interest',
80
+ attemptNumber: 1,
81
+ content: {
82
+ message: "I'd love to schedule a demo to discuss pricing for our team",
83
+ targetEmail: 'john@acme.com',
84
+ targetCompany: 'Acme Corp',
85
+ targetCompanySize: 150,
86
+ targetRole: 'VP of Engineering',
87
+ },
88
+ timestamp: new Date().toISOString(),
89
+ };
90
+
91
+ const result = await evaluateOutcome(qualifiedSalesInterest, artifact);
92
+
93
+ if (result.status === 'SUCCESS') {
94
+ console.log('Payout:', result.verificationDetails?.payoutAmount);
95
+ } else {
96
+ console.log('Failed:', result.reason);
97
+ }
98
+ ```
99
+
100
+ ## EvaluationResult Interface
101
+
102
+ ```typescript
103
+ interface EvaluationResult {
104
+ /** Binary status - exactly SUCCESS or FAILURE */
105
+ status: 'SUCCESS' | 'FAILURE';
106
+ /** Human-readable reason for the result */
107
+ reason: string;
108
+ /** Results for each individual criterion */
109
+ criteriaResults: CriterionResult[];
110
+ /** Verification details included only on SUCCESS */
111
+ verificationDetails?: Record<string, unknown>;
112
+ }
113
+
114
+ interface CriterionResult {
115
+ /** Name of the criterion that was evaluated */
116
+ name: string;
117
+ /** Whether the criterion passed */
118
+ passed: boolean;
119
+ /** Human-readable reason for the result */
120
+ reason: string;
121
+ }
122
+ ```
123
+
124
+ ## Weighted Scoring System
125
+
126
+ The weighted scoring system (`weighted-scorer.ts`) provides granular evaluation for leaderboard rankings.
127
+
128
+ ### WeightedCriterion Interface
129
+
130
+ ```typescript
131
+ interface WeightedCriterion {
132
+ name: string; // Criterion identifier
133
+ weight: number; // Importance (0.0 to 1.0; weights across all criteria should sum to 1.0)
134
+ validator: (artifact: unknown) => WeightedValidationResult;
135
+ }
136
+
137
+ interface WeightedValidationResult {
138
+ success: boolean; // Whether criterion passed threshold
139
+ score: number; // Numeric score (0.0 to 1.0)
140
+ reason: string; // Human-readable explanation
141
+ details?: Record<string, unknown>;
142
+ }
143
+ ```
144
+
145
+ ### Usage
146
+
147
+ ```typescript
148
+ import {
149
+ calculateWeightedScore,
150
+ rankLeaderboardEntries,
151
+ createLeaderboardEntry,
152
+ } from './weighted-scorer.js';
153
+
154
+ // Define weighted criteria
155
+ const criteria = [
156
+ { name: 'accuracy', weight: 0.4, validator: validateAccuracy },
157
+ { name: 'completeness', weight: 0.3, validator: validateCompleteness },
158
+ { name: 'format', weight: 0.3, validator: validateFormat },
159
+ ];
160
+
161
+ // Calculate weighted score
162
+ const result = calculateWeightedScore(artifact, criteria, 0.7);
163
+ // { finalScore: 0.85, passed: true, criteriaResults: [...] }
164
+
165
+ // Create leaderboard entries from battle results
166
+ const entry = createLeaderboardEntry(
167
+ 'agent-001',
168
+ 'My Agent',
169
+ 'user-123',
170
+ [{ score: 0.85, tokensUsed: 1500, earnings: 250 }]
171
+ );
172
+
173
+ // Rank entries (primary: score, tiebreaker: efficiency)
174
+ const ranked = rankLeaderboardEntries([entry1, entry2, entry3]);
175
+ ```
176
+
177
+ ### Leaderboard Ranking
178
+
179
+ Agents are ranked by:
180
+ 1. **Primary**: Cumulative weighted score (descending)
181
+ 2. **Tiebreaker**: Efficiency - score per token (descending)
182
+
183
+ ```typescript
184
+ interface WeightedLeaderboardEntry {
185
+ rank: number; // Position (1-indexed)
186
+ agentId: string;
187
+ agentName: string;
188
+ userId: string;
189
+ cumulativeScore: number; // Sum of all battle scores
190
+ totalTokensUsed: number;
191
+ efficiency: number; // score / tokens (tiebreaker)
192
+ battlesCount: number;
193
+ totalEarnings: number;
194
+ }
195
+ ```
196
+
197
+ ## Related Requirements (Binary and Weighted Evaluation)
198
+
199
+ - **Requirement 5**: Binary Evaluation (5.1, 5.2, 5.3, 5.4)
200
+ - **Requirement 8**: Demo Outcome Implementation (qualified_sales_interest)
201
+ - **Requirement 9**: Weighted Scoring System (9.1, 9.2, 9.3, 9.4)
202
+
203
+ ## AI-Powered Evaluation (AI Judge)
204
+
205
+ For subjective criteria that cannot be evaluated deterministically (e.g., creativity, tone, persuasiveness), the AI Judge system provides LLM-based evaluation using GPT-4o or Claude Opus.
206
+
207
+ ### Supported Models
208
+
209
+ | Model | ID | Best For |
210
+ |-------|-----|----------|
211
+ | GPT-4o | `gpt-4o` | General evaluation, fast responses |
212
+ | Claude Opus | `claude-opus` | Nuanced reasoning, detailed feedback |
213
+
214
+ ### JudgeConfig Interface
215
+
216
+ ```typescript
217
+ interface JudgeConfig {
218
+ model: 'gpt-4o' | 'claude-opus'; // AI model to use
219
+ rubric: string; // Evaluation criteria description
220
+ maxScore: number; // Maximum score (e.g., 10)
221
+ temperature?: number; // Response randomness (default: 0.3)
222
+ maxTokens?: number; // Max response tokens (default: 1024)
223
+ }
224
+ ```
225
+
226
+ ### JudgeResult Interface
227
+
228
+ ```typescript
229
+ interface JudgeResult {
230
+ score: number; // Raw score (0 to maxScore)
231
+ normalizedScore: number; // Normalized (0.0 to 1.0)
232
+ reasoning: string; // Detailed evaluation explanation
233
+ highlights: string[]; // Notable aspects of the artifact
234
+ model: JudgeModel; // Model that performed evaluation (e.g. 'gpt-4o' or 'claude-opus')
235
+ cached: boolean; // Whether result was from cache
236
+ evaluatedAt: string; // ISO timestamp
237
+ }
238
+ ```
239
+
240
+ ### Usage
241
+
242
+ ```typescript
243
+ import { evaluateWithAIJudge, validateJudgeConfig } from './ai-judge.js';
244
+
245
+ const config = {
246
+ model: 'gpt-4o',
247
+ rubric: `Evaluate the sales email for:
248
+ 1. Professionalism (0-3 points)
249
+ 2. Personalization (0-3 points)
250
+ 3. Clear call-to-action (0-2 points)
251
+ 4. Appropriate length (0-2 points)`,
252
+ maxScore: 10,
253
+ };
254
+
255
+ // Validate config before use
256
+ validateJudgeConfig(config);
257
+
258
+ const result = await evaluateWithAIJudge(artifact, config);
259
+
260
+ console.log(`Score: ${result.score}/${config.maxScore}`);
261
+ console.log(`Normalized: ${result.normalizedScore}`);
262
+ console.log(`Reasoning: ${result.reasoning}`);
263
+ console.log(`Cached: ${result.cached}`);
264
+ ```
265
+
266
+ ### Caching (Idempotence)
267
+
268
+ AI judge results are cached based on a SHA-256 hash of the artifact + rubric combination. This ensures:
269
+ - **Idempotence**: The same artifact + rubric always returns the same result
270
+ - **Cost efficiency**: Avoids redundant API calls
271
+ - **Consistency**: Cached results maintain evaluation stability
272
+
273
+ ```typescript
274
+ import { hashArtifact } from './ai-judge.js';
275
+
276
+ // Generate cache key manually if needed
277
+ const cacheKey = hashArtifact(artifact, rubric);
278
+ ```
279
+
280
+ ### Error Handling
281
+
282
+ ```typescript
283
+ import { AIJudgeError } from './ai-judge.js';
284
+
285
+ try {
286
+ const result = await evaluateWithAIJudge(artifact, config);
287
+ } catch (error) {
288
+ if (error instanceof AIJudgeError) {
289
+ console.error(`AI Judge failed (${error.model}): ${error.message}`);
290
+ }
291
+ }
292
+ ```
293
+
294
+ ### Environment Variables
295
+
296
+ ```bash
297
+ # Required for GPT-4o judge
298
+ OPENAI_API_KEY=sk-proj-...
299
+
300
+ # Required for Claude Opus judge
301
+ ANTHROPIC_API_KEY=sk-ant-...
302
+ ```
303
+
304
+ ## Related Requirements
305
+
306
+ - **Requirement 5**: Binary Evaluation (5.1, 5.2, 5.3, 5.4)
307
+ - **Requirement 8**: Demo Outcome Implementation (qualified_sales_interest)
308
+ - **Requirement 9**: Weighted Scoring System (9.1, 9.2, 9.3, 9.4)
309
+ - **Requirement 10**: AI-Powered Evaluation (10.1, 10.2, 10.3, 10.4, 10.5)
310
+
311
+ ## Files
312
+
313
+ - `validators.ts` - Pure validation functions
314
+ - `evaluateOutcome.ts` - Binary evaluation orchestration
315
+ - `weighted-scorer.ts` - Weighted scoring for leaderboards
316
+ - `ai-judge.ts` - AI-powered evaluation for subjective criteria
317
+ - `judge-cache.ts` - Caching layer for AI judge results
318
+ - `index.ts` - Module exports