outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
# Evaluation System
|
|
2
|
+
|
|
3
|
+
Evaluation of agent artifacts against outcome success criteria, supporting both binary and weighted scoring modes.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
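The evaluation system determines whether an agent has successfully achieved an outcome by running all success criteria validators against the agent's artifact. Validator-based evaluation is **deterministic** - the same inputs always produce the same result. (Subjective criteria scored by the AI Judge, described below, cannot be evaluated deterministically and rely on result caching for repeatability.)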
|
|
8
|
+
|
|
9
|
+
## Evaluation Modes
|
|
10
|
+
|
|
11
|
+
### Binary Evaluation (Default)
|
|
12
|
+
Traditional pass/fail evaluation where all criteria must pass for success. Used for bounty payouts.
|
|
13
|
+
|
|
14
|
+
### Weighted Scoring
|
|
15
|
+
Granular scoring with weighted criteria for partial success. Used for leaderboard rankings and agent comparison. Agents are rewarded based on criterion importance even without full success.
|
|
16
|
+
|
|
17
|
+
## Key Principles
|
|
18
|
+
|
|
19
|
+
1. **Deterministic** - Same inputs always produce identical outputs
|
|
20
|
+
2. **Fail-Closed** - When in doubt, return FAILURE with no payout
|
|
21
|
+
3. **Observable** - All validation results are logged with structured reasons
|
|
22
|
+
4. **Granular Scoring** - Weighted evaluation rewards partial success for rankings
|
|
23
|
+
|
|
24
|
+
## Validators
|
|
25
|
+
|
|
26
|
+
Pure validation functions that check individual success criteria:
|
|
27
|
+
|
|
28
|
+
| Validator | Purpose | Requirements |
|
|
29
|
+
|-----------|---------|--------------|
|
|
30
|
+
| `validateBuyingIntent` | Checks message contains buying intent keywords | 8.1 |
|
|
31
|
+
| `validateCompanySize` | Verifies company has minimum employee count | 8.2 |
|
|
32
|
+
| `validateRole` | Ensures role is not in excluded list | 8.3 |
|
|
33
|
+
| `validateMessageLength` | Validates message meets minimum word count | 8.4 |
|
|
34
|
+
| `validateEmail` | Checks email syntax is valid | 8.5 |
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
```typescript
|
|
39
|
+
import {
|
|
40
|
+
validateBuyingIntent,
|
|
41
|
+
validateCompanySize,
|
|
42
|
+
validateRole,
|
|
43
|
+
validateMessageLength,
|
|
44
|
+
validateEmail,
|
|
45
|
+
} from './validators.js';
|
|
46
|
+
|
|
47
|
+
// Each validator returns { valid: boolean, errors: string[] }
|
|
48
|
+
const intentResult = validateBuyingIntent(
|
|
49
|
+
"I'd like to schedule a demo",
|
|
50
|
+
['pricing', 'demo', 'next steps']
|
|
51
|
+
);
|
|
52
|
+
// { valid: true, errors: [] }
|
|
53
|
+
|
|
54
|
+
const sizeResult = validateCompanySize(25, 50);
|
|
55
|
+
// { valid: false, errors: ['Company too small - must have at least 50 employees, got 25'] }
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## ValidationResult Interface
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
interface ValidationResult {
|
|
62
|
+
/** Whether validation passed */
|
|
63
|
+
valid: boolean;
|
|
64
|
+
/** Error messages if validation failed */
|
|
65
|
+
errors: string[];
|
|
66
|
+
}
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Evaluation Orchestration
|
|
70
|
+
|
|
71
|
+
The `evaluateOutcome` function orchestrates binary evaluation of agent artifacts:
|
|
72
|
+
|
|
73
|
+
```typescript
|
|
74
|
+
import { evaluateOutcome } from './evaluateOutcome.js';
|
|
75
|
+
import { qualifiedSalesInterest } from '../outcomes/qualified_sales_interest.js';
|
|
76
|
+
|
|
77
|
+
const artifact = {
|
|
78
|
+
agentId: 'agent-001',
|
|
79
|
+
outcomeId: 'qualified_sales_interest',
|
|
80
|
+
attemptNumber: 1,
|
|
81
|
+
content: {
|
|
82
|
+
message: "I'd love to schedule a demo to discuss pricing for our team",
|
|
83
|
+
targetEmail: 'john@acme.com',
|
|
84
|
+
targetCompany: 'Acme Corp',
|
|
85
|
+
targetCompanySize: 150,
|
|
86
|
+
targetRole: 'VP of Engineering',
|
|
87
|
+
},
|
|
88
|
+
timestamp: new Date().toISOString(),
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
const result = await evaluateOutcome(qualifiedSalesInterest, artifact);
|
|
92
|
+
|
|
93
|
+
if (result.status === 'SUCCESS') {
|
|
94
|
+
console.log('Payout:', result.verificationDetails?.payoutAmount);
|
|
95
|
+
} else {
|
|
96
|
+
console.log('Failed:', result.reason);
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## EvaluationResult Interface
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
interface EvaluationResult {
|
|
104
|
+
/** Binary status - exactly SUCCESS or FAILURE */
|
|
105
|
+
status: 'SUCCESS' | 'FAILURE';
|
|
106
|
+
/** Human-readable reason for the result */
|
|
107
|
+
reason: string;
|
|
108
|
+
/** Results for each individual criterion */
|
|
109
|
+
criteriaResults: CriterionResult[];
|
|
110
|
+
/** Verification details included only on SUCCESS */
|
|
111
|
+
verificationDetails?: Record<string, unknown>;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
interface CriterionResult {
|
|
115
|
+
/** Name of the criterion that was evaluated */
|
|
116
|
+
name: string;
|
|
117
|
+
/** Whether the criterion passed */
|
|
118
|
+
passed: boolean;
|
|
119
|
+
/** Human-readable reason for the result */
|
|
120
|
+
reason: string;
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Weighted Scoring System
|
|
125
|
+
|
|
126
|
+
The weighted scoring system (`weighted-scorer.ts`) provides granular evaluation for leaderboard rankings:
|
|
127
|
+
|
|
128
|
+
### WeightedCriterion Interface
|
|
129
|
+
|
|
130
|
+
```typescript
|
|
131
|
+
interface WeightedCriterion {
|
|
132
|
+
name: string; // Criterion identifier
|
|
133
|
+
weight: number;           // Importance (0.0 to 1.0; weights across all criteria should sum to 1.0)
|
|
134
|
+
validator: (artifact: unknown) => WeightedValidationResult;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
interface WeightedValidationResult {
|
|
138
|
+
success: boolean; // Whether criterion passed threshold
|
|
139
|
+
score: number; // Numeric score (0.0 to 1.0)
|
|
140
|
+
reason: string; // Human-readable explanation
|
|
141
|
+
details?: Record<string, unknown>;
|
|
142
|
+
}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Usage
|
|
146
|
+
|
|
147
|
+
```typescript
|
|
148
|
+
import {
|
|
149
|
+
calculateWeightedScore,
|
|
150
|
+
rankLeaderboardEntries,
|
|
151
|
+
createLeaderboardEntry,
|
|
152
|
+
} from './weighted-scorer.js';
|
|
153
|
+
|
|
154
|
+
// Define weighted criteria
|
|
155
|
+
const criteria = [
|
|
156
|
+
{ name: 'accuracy', weight: 0.4, validator: validateAccuracy },
|
|
157
|
+
{ name: 'completeness', weight: 0.3, validator: validateCompleteness },
|
|
158
|
+
{ name: 'format', weight: 0.3, validator: validateFormat },
|
|
159
|
+
];
|
|
160
|
+
|
|
161
|
+
// Calculate weighted score
|
|
162
|
+
const result = calculateWeightedScore(artifact, criteria, 0.7);
|
|
163
|
+
// { finalScore: 0.85, passed: true, criteriaResults: [...] }
|
|
164
|
+
|
|
165
|
+
// Create leaderboard entries from battle results
|
|
166
|
+
const entry = createLeaderboardEntry(
|
|
167
|
+
'agent-001',
|
|
168
|
+
'My Agent',
|
|
169
|
+
'user-123',
|
|
170
|
+
[{ score: 0.85, tokensUsed: 1500, earnings: 250 }]
|
|
171
|
+
);
|
|
172
|
+
|
|
173
|
+
// Rank entries (primary: score, tiebreaker: efficiency); entry1..entry3 are entries built like `entry` above
|
|
174
|
+
const ranked = rankLeaderboardEntries([entry1, entry2, entry3]);
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Leaderboard Ranking
|
|
178
|
+
|
|
179
|
+
Agents are ranked by:
|
|
180
|
+
1. **Primary**: Cumulative weighted score (descending)
|
|
181
|
+
2. **Tiebreaker**: Efficiency - score per token (descending)
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
interface WeightedLeaderboardEntry {
|
|
185
|
+
rank: number; // Position (1-indexed)
|
|
186
|
+
agentId: string;
|
|
187
|
+
agentName: string;
|
|
188
|
+
userId: string;
|
|
189
|
+
cumulativeScore: number; // Sum of all battle scores
|
|
190
|
+
totalTokensUsed: number;
|
|
191
|
+
efficiency: number; // score / tokens (tiebreaker)
|
|
192
|
+
battlesCount: number;
|
|
193
|
+
totalEarnings: number;
|
|
194
|
+
}
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Related Requirements
|
|
198
|
+
|
|
199
|
+
- **Requirement 5**: Binary Evaluation (5.1, 5.2, 5.3, 5.4)
|
|
200
|
+
- **Requirement 8**: Demo Outcome Implementation (qualified_sales_interest)
|
|
201
|
+
- **Requirement 9**: Weighted Scoring System (9.1, 9.2, 9.3, 9.4)
|
|
202
|
+
|
|
203
|
+
## AI-Powered Evaluation (AI Judge)
|
|
204
|
+
|
|
205
|
+
For subjective criteria that cannot be evaluated deterministically (e.g., creativity, tone, persuasiveness), the AI Judge system provides LLM-based evaluation using GPT-4o or Claude Opus.
|
|
206
|
+
|
|
207
|
+
### Supported Models
|
|
208
|
+
|
|
209
|
+
| Model | ID | Best For |
|
|
210
|
+
|-------|-----|----------|
|
|
211
|
+
| GPT-4o | `gpt-4o` | General evaluation, fast responses |
|
|
212
|
+
| Claude Opus | `claude-opus` | Nuanced reasoning, detailed feedback |
|
|
213
|
+
|
|
214
|
+
### JudgeConfig Interface
|
|
215
|
+
|
|
216
|
+
```typescript
|
|
217
|
+
interface JudgeConfig {
|
|
218
|
+
model: 'gpt-4o' | 'claude-opus'; // AI model to use
|
|
219
|
+
rubric: string; // Evaluation criteria description
|
|
220
|
+
maxScore: number; // Maximum score (e.g., 10)
|
|
221
|
+
temperature?: number; // Response randomness (default: 0.3)
|
|
222
|
+
maxTokens?: number; // Max response tokens (default: 1024)
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### JudgeResult Interface
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
interface JudgeResult {
|
|
230
|
+
score: number; // Raw score (0 to maxScore)
|
|
231
|
+
normalizedScore: number; // Normalized (0.0 to 1.0)
|
|
232
|
+
reasoning: string; // Detailed evaluation explanation
|
|
233
|
+
highlights: string[]; // Notable aspects of the artifact
|
|
234
|
+
model: JudgeModel; // Model that performed evaluation
|
|
235
|
+
cached: boolean; // Whether result was from cache
|
|
236
|
+
evaluatedAt: string; // ISO timestamp
|
|
237
|
+
}
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Usage
|
|
241
|
+
|
|
242
|
+
```typescript
|
|
243
|
+
import { evaluateWithAIJudge, validateJudgeConfig } from './ai-judge.js';
|
|
244
|
+
|
|
245
|
+
const config = {
|
|
246
|
+
model: 'gpt-4o',
|
|
247
|
+
rubric: `Evaluate the sales email for:
|
|
248
|
+
1. Professionalism (0-3 points)
|
|
249
|
+
2. Personalization (0-3 points)
|
|
250
|
+
3. Clear call-to-action (0-2 points)
|
|
251
|
+
4. Appropriate length (0-2 points)`,
|
|
252
|
+
maxScore: 10,
|
|
253
|
+
};
|
|
254
|
+
|
|
255
|
+
// Validate config before use
|
|
256
|
+
validateJudgeConfig(config);
|
|
257
|
+
|
|
258
|
+
const result = await evaluateWithAIJudge(artifact, config);
|
|
259
|
+
|
|
260
|
+
console.log(`Score: ${result.score}/${config.maxScore}`);
|
|
261
|
+
console.log(`Normalized: ${result.normalizedScore}`);
|
|
262
|
+
console.log(`Reasoning: ${result.reasoning}`);
|
|
263
|
+
console.log(`Cached: ${result.cached}`);
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Caching (Idempotence)
|
|
267
|
+
|
|
268
|
+
AI judge results are cached based on a SHA-256 hash of the artifact + rubric combination. This ensures:
|
|
269
|
+
- **Idempotence**: The same artifact + rubric always returns the same result
|
|
270
|
+
- **Cost efficiency**: Avoids redundant API calls
|
|
271
|
+
- **Consistency**: Cached results maintain evaluation stability
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import { hashArtifact } from './ai-judge.js';
|
|
275
|
+
|
|
276
|
+
// Generate cache key manually if needed
|
|
277
|
+
const cacheKey = hashArtifact(artifact, rubric);
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### Error Handling
|
|
281
|
+
|
|
282
|
+
```typescript
|
|
283
|
+
import { AIJudgeError } from './ai-judge.js';
|
|
284
|
+
|
|
285
|
+
try {
|
|
286
|
+
const result = await evaluateWithAIJudge(artifact, config);
|
|
287
|
+
} catch (error) {
|
|
288
|
+
if (error instanceof AIJudgeError) {
|
|
289
|
+
console.error(`AI Judge failed (${error.model}): ${error.message}`);
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Environment Variables
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
# Required for GPT-4o judge
|
|
298
|
+
OPENAI_API_KEY=sk-proj-...
|
|
299
|
+
|
|
300
|
+
# Required for Claude Opus judge
|
|
301
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Related Requirements
|
|
305
|
+
|
|
306
|
+
- **Requirement 5**: Binary Evaluation (5.1, 5.2, 5.3, 5.4)
|
|
307
|
+
- **Requirement 8**: Demo Outcome Implementation (qualified_sales_interest)
|
|
308
|
+
- **Requirement 9**: Weighted Scoring System (9.1, 9.2, 9.3, 9.4)
|
|
309
|
+
- **Requirement 10**: AI-Powered Evaluation (10.1, 10.2, 10.3, 10.4, 10.5)
|
|
310
|
+
|
|
311
|
+
## Files
|
|
312
|
+
|
|
313
|
+
- `validators.ts` - Pure validation functions
|
|
314
|
+
- `evaluateOutcome.ts` - Binary evaluation orchestration
|
|
315
|
+
- `weighted-scorer.ts` - Weighted scoring for leaderboards
|
|
316
|
+
- `ai-judge.ts` - AI-powered evaluation for subjective criteria
|
|
317
|
+
- `judge-cache.ts` - Caching layer for AI judge results
|
|
318
|
+
- `index.ts` - Module exports
|