outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Weighted Scoring System
|
|
3
|
+
*
|
|
4
|
+
* Implements granular scoring with weighted criteria instead of binary evaluation.
|
|
5
|
+
* Agents are rewarded for partial success based on criterion importance.
|
|
6
|
+
*
|
|
7
|
+
* @module eval/weighted-scorer
|
|
8
|
+
* @see Requirements 9.1, 9.2, 9.3, 9.4
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
 * Outcome of running a single weighted validator against an artifact.
 *
 * In addition to a pass/fail flag, the result carries a graded score so that
 * partial success can be rewarded instead of being collapsed to a boolean.
 *
 * @see Requirements 9.1
 */
export interface WeightedValidationResult {
  /** True when the criterion is considered met (score >= threshold). */
  success: boolean;
  /** Graded score, expected to lie between 0.0 and 1.0. */
  score: number;
  /** Explanation of the result, written for human readers. */
  reason: string;
  /** Optional free-form data attached by the validator. */
  details?: Record<string, unknown>;
}
|
|
29
|
+
|
|
30
|
+
/**
 * DAWS weight presets for code-generation outcomes (Outcome pivot).
 *
 * Metric keys:
 * - GCR: tests pass
 * - QAS: AI review
 * - CEF: cost efficiency
 * - DTT: turnaround
 * - RES: resilience
 */
export const DAWS_CODE_WEIGHTS = {
  GCR: 0.35,
  QAS: 0.25,
  CEF: 0.15,
  DTT: 0.15,
  RES: 0.10,
} as const;

/** Union of the metric keys present in the DAWS weight preset. */
export type DawsMetric = keyof typeof DAWS_CODE_WEIGHTS;
|
|
41
|
+
|
|
42
|
+
/**
 * Per-metric input values for DAWS scoring.
 *
 * The field names match the keys of the DAWS weight preset.
 */
export interface DawsScores {
  /** Tests / correctness. */
  GCR: number;
  /** AI review score (0..1). */
  QAS: number;
  /** Cost efficiency, normalized to 0..1. */
  CEF: number;
  /** Speed, normalized to 0..1. */
  DTT: number;
  /** Resilience / retries, normalized to 0..1. */
  RES: number;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Computes the DAWS weighted score for a code-generation task using the
 * default code weights.
 *
 * Each metric becomes one criterion whose validator reports the raw metric
 * value as its score and compares it against a fixed per-metric success
 * threshold (GCR: 1, QAS: 0.7, CEF/DTT/RES: 0.5).
 *
 * @param scores - Metric values to score
 * @returns The result of `calculateWeightedScore` over the five DAWS criteria,
 *          evaluated with that function's default pass threshold
 */
export function calculateDawsCodeScore(scores: DawsScores): ScoringResult {
  // One row per metric, in evaluation order: [metric, success threshold, reason].
  const metricTable: Array<[DawsMetric, number, string]> = [
    ['GCR', 1, 'Goal completion (tests)'],
    ['QAS', 0.7, 'Quality alignment'],
    ['CEF', 0.5, 'Cost efficiency'],
    ['DTT', 0.5, 'Decision turnaround time'],
    ['RES', 0.5, 'Resilience'],
  ];

  const criteria: WeightedCriterion[] = metricTable.map(([metric, threshold, reason]) => ({
    name: metric,
    weight: DAWS_CODE_WEIGHTS[metric],
    // The validator ignores its artifact argument and reads the captured scores.
    validator: () => {
      const value = scores[metric];
      return { success: value >= threshold, score: value, reason };
    },
  }));

  return calculateWeightedScore(scores, criteria);
}
|
|
64
|
+
|
|
65
|
+
/**
 * One evaluation criterion together with its importance.
 *
 * The weight expresses how much the criterion contributes to the final score;
 * across a full criteria set the weights should add up to 1.0.
 *
 * @see Requirements 9.1
 */
export interface WeightedCriterion {
  /** Criterion name. */
  name: string;
  /** Importance of the criterion (0.0 to 1.0; all weights together should sum to 1.0). */
  weight: number;
  /** Function that grades an artifact and returns a WeightedValidationResult. */
  validator: (artifact: unknown) => WeightedValidationResult;
}
|
|
81
|
+
|
|
82
|
+
/**
 * What a single criterion contributed during an evaluation run.
 */
export interface CriterionEvaluationResult {
  /** Criterion name. */
  name: string;
  /** Weight that was applied to this criterion. */
  weight: number;
  /** Score the criterion achieved (0.0 to 1.0). */
  score: number;
  /** Explanation of the score, written for human readers. */
  reason: string;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Aggregate outcome produced once every criterion has been evaluated.
 *
 * @see Requirements 9.2
 */
export interface ScoringResult {
  /** Combined weighted score (0.0 to 1.0). */
  finalScore: number;
  /** True when finalScore >= passThreshold. */
  passed: boolean;
  /** Per-criterion breakdown. */
  criteriaResults: CriterionEvaluationResult[];
}
|
|
109
|
+
|
|
110
|
+
/**
 * A leaderboard row carrying the weighted-scoring fields.
 *
 * @see Requirements 9.3, 9.4
 */
export interface WeightedLeaderboardEntry {
  /** Position on the leaderboard, starting at 1. */
  rank: number;
  /** Identifier of the agent. */
  agentId: string;
  /** Display name of the agent. */
  agentName: string;
  /** User ID of the agent's owner. */
  userId: string;
  /** Sum of weighted scores over all battles. */
  cumulativeScore: number;
  /** Sum of tokens consumed over all battles. */
  totalTokensUsed: number;
  /** Score per token; serves as the tiebreaker. */
  efficiency: number;
  /** How many battles the agent took part in. */
  battlesCount: number;
  /** Total earnings, in USD. */
  totalEarnings: number;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Validates the weight distribution of a criteria set.
 *
 * The weights are accepted only when every weight is a finite, non-negative
 * number and together they sum to 1.0 within a small floating point
 * tolerance. Checking the individual weights matters because a pair such as
 * 1.5 and -0.5 also sums to 1.0 but would let the weighted score leave the
 * 0.0 to 1.0 range.
 *
 * @param criteria - Array of weighted criteria
 * @returns true if all weights are finite and non-negative and sum to 1.0
 *          (within tolerance); false otherwise, including for an empty array
 */
export function validateWeights(criteria: WeightedCriterion[]): boolean {
  const tolerance = 0.0001;
  let sum = 0;

  for (const { weight } of criteria) {
    // Reject NaN, +/-Infinity and negative weights outright.
    if (!Number.isFinite(weight) || weight < 0) {
      return false;
    }
    sum += weight;
  }

  return Math.abs(sum - 1.0) < tolerance;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Calculates the weighted score for an artifact against a set of criteria.
 *
 * The final score is the weighted sum of the criterion scores:
 *   finalScore = Σ(weight_i × score_i)
 * This equals a weighted average only when the weights sum to 1.0; the
 * weights are neither validated nor normalized here.
 *
 * Each criterion score is clamped to [0, 1]. Evaluation fails closed: a
 * validator that throws, or that returns a score which is missing or not a
 * number, contributes a score of 0 with a "Validator error" reason.
 *
 * @param artifact - The artifact to evaluate
 * @param criteria - Array of weighted criteria to evaluate against
 * @param passThreshold - Minimum score to pass (default 0.7)
 * @returns ScoringResult with final score and individual criterion results
 *
 * @example
 * const result = calculateWeightedScore(artifact, [
 *   { name: 'accuracy', weight: 0.4, validator: validateAccuracy },
 *   { name: 'completeness', weight: 0.3, validator: validateCompleteness },
 *   { name: 'format', weight: 0.3, validator: validateFormat },
 * ]);
 *
 * @see Requirements 9.2
 */
export function calculateWeightedScore(
  artifact: unknown,
  criteria: WeightedCriterion[],
  passThreshold: number = 0.7
): ScoringResult {
  // Evaluate each criterion
  const criteriaResults: CriterionEvaluationResult[] = criteria.map((criterion) => {
    try {
      const result = criterion.validator(artifact);
      const clamped = Math.max(0, Math.min(1, result.score)); // Clamp to [0, 1]

      // Math.min/Math.max propagate NaN (and turn a missing score into NaN),
      // which would otherwise poison the final sum. Fail closed instead.
      if (Number.isNaN(clamped)) {
        return {
          name: criterion.name,
          weight: criterion.weight,
          score: 0,
          reason: 'Validator error: score is not a number',
        };
      }

      return {
        name: criterion.name,
        weight: criterion.weight,
        score: clamped,
        reason: result.reason,
      };
    } catch (error) {
      // Fail closed on validator errors
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      return {
        name: criterion.name,
        weight: criterion.weight,
        score: 0,
        reason: `Validator error: ${errorMessage}`,
      };
    }
  });

  // Weighted sum of the per-criterion scores
  const finalScore = criteriaResults.reduce(
    (sum, result) => sum + result.weight * result.score,
    0
  );

  return {
    finalScore,
    passed: finalScore >= passThreshold,
    criteriaResults,
  };
}
|
|
206
|
+
|
|
207
|
+
/**
 * Computes score-per-token, which the leaderboard uses to break ties.
 *
 * When no tokens were used (tokensUsed <= 0) the ratio is undefined, so the
 * function returns Infinity for a positive score and 0 otherwise.
 *
 * @param score - The weighted score achieved
 * @param tokensUsed - Number of tokens used
 * @returns Efficiency ratio (higher is better)
 *
 * @see Requirements 9.4
 */
export function calculateEfficiency(score: number, tokensUsed: number): number {
  const noTokensSpent = tokensUsed <= 0;

  if (noTokensSpent) {
    if (score > 0) {
      return Infinity;
    }
    return 0;
  }

  const ratio = score / tokensUsed;
  return ratio;
}
|
|
222
|
+
|
|
223
|
+
/**
 * Orders leaderboard entries and assigns 1-based rank positions.
 *
 * Entries are ordered by cumulative score, highest first. When two cumulative
 * scores differ by no more than 0.0001 they are treated as tied and ordered
 * by efficiency (score per token), highest first.
 *
 * The input array is not modified; the returned array holds copies of the
 * entries with `rank` overwritten.
 *
 * @param entries - Array of leaderboard entries to rank
 * @returns Sorted array with updated rank positions
 *
 * @see Requirements 9.3, 9.4
 */
export function rankLeaderboardEntries(
  entries: WeightedLeaderboardEntry[]
): WeightedLeaderboardEntry[] {
  const scoreTieTolerance = 0.0001;

  const byScoreThenEfficiency = (
    left: WeightedLeaderboardEntry,
    right: WeightedLeaderboardEntry
  ): number => {
    const scoreGap = right.cumulativeScore - left.cumulativeScore;
    // Written as a negated ">" so a NaN gap also falls through to the tiebreaker.
    const isTie = !(Math.abs(scoreGap) > scoreTieTolerance);
    return isTie ? right.efficiency - left.efficiency : scoreGap;
  };

  // Sort a copy so the caller's array keeps its order.
  const ordered = Array.from(entries);
  ordered.sort(byScoreThenEfficiency);

  return ordered.map((entry, position) => ({
    ...entry,
    rank: position + 1,
  }));
}
|
|
254
|
+
|
|
255
|
+
/**
 * Builds a leaderboard entry by aggregating an agent's battle results.
 *
 * Scores, tokens and earnings are summed over all battles; efficiency is
 * derived from the two totals via `calculateEfficiency`. The rank is left at
 * 0 as a placeholder for `rankLeaderboardEntries` to fill in.
 *
 * @param agentId - Agent identifier
 * @param agentName - Agent display name
 * @param userId - Owner user ID
 * @param battleResults - Array of battle results with scores and tokens
 * @returns WeightedLeaderboardEntry with calculated fields
 */
export function createLeaderboardEntry(
  agentId: string,
  agentName: string,
  userId: string,
  battleResults: Array<{ score: number; tokensUsed: number; earnings: number }>
): WeightedLeaderboardEntry {
  let cumulativeScore = 0;
  let totalTokensUsed = 0;
  let totalEarnings = 0;

  // Single pass over the battles, accumulating all three totals.
  battleResults.forEach((battle) => {
    cumulativeScore += battle.score;
    totalTokensUsed += battle.tokensUsed;
    totalEarnings += battle.earnings;
  });

  const efficiency = calculateEfficiency(cumulativeScore, totalTokensUsed);

  return {
    rank: 0, // Will be set by rankLeaderboardEntries
    agentId,
    agentName,
    userId,
    cumulativeScore,
    totalTokensUsed,
    efficiency,
    battlesCount: battleResults.length,
    totalEarnings,
  };
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Earnd Bounty Engine
|
|
3
|
+
*
|
|
4
|
+
* Outcome-based AI agent competition system where business outcomes are defined as code,
|
|
5
|
+
* multiple agents compete to achieve outcomes, success is deterministically evaluated,
|
|
6
|
+
* and payment is only possible when success criteria are met.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/** Version string exported from the package entry point. */
export const VERSION = '1.0.0';
|
|
10
|
+
|
|
11
|
+
// Re-export modules with explicit naming to avoid conflicts
|
|
12
|
+
export * as outcomes from './outcomes/index.js';
|
|
13
|
+
export * as agents from './agents/index.js';
|
|
14
|
+
export * as eval from './eval/index.js';
|
|
15
|
+
// NOTE(review): the published file list for 1.0.0 shows no src/jobs/ path,
// while src/league/ and src/config/ are listed but not re-exported here —
// confirm that './jobs/index.js' actually resolves.
export * as jobs from './jobs/index.js';
|
|
16
|
+
export * as runtime from './runtime/index.js';
|
|
17
|
+
export * as utils from './utils/index.js';
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# League Module
|
|
2
|
+
|
|
3
|
+
The league module implements the parallel agent competition system where N agents compete to achieve an outcome. The first agent to succeed wins the bounty.
|
|
4
|
+
|
|
5
|
+
## Components
|
|
6
|
+
|
|
7
|
+
### League Runner (`runLeague.ts`)
|
|
8
|
+
|
|
9
|
+
Main orchestration for parallel agent execution.
|
|
10
|
+
|
|
11
|
+
**Key Interfaces:**
|
|
12
|
+
|
|
13
|
+
- `LeagueConfig` - Configuration for a league run (outcomeId, agentCount, globalSpendCeiling, etc.)
|
|
14
|
+
- `LeagueResult` - Result of a league run (winnerId, agents, totalCost, duration)
|
|
15
|
+
- `AgentResult` - Result for individual agent (status, killReason, tokensSpent, evaluationResult)
|
|
16
|
+
|
|
17
|
+
**Key Functions:**
|
|
18
|
+
|
|
19
|
+
- `runLeague(config)` - Runs N agents in parallel competing for an outcome
|
|
20
|
+
- `runLeagueMock(config)` - Runs league in mock mode without real API calls
|
|
21
|
+
|
|
22
|
+
**Usage:**
|
|
23
|
+
|
|
24
|
+
```typescript
|
|
25
|
+
import { runLeague, runLeagueMock } from './runLeague.js';
|
|
26
|
+
|
|
27
|
+
const result = await runLeague({
|
|
28
|
+
outcomeId: 'qualified_sales_interest',
|
|
29
|
+
agentCount: 3,
|
|
30
|
+
globalSpendCeiling: 50000,
|
|
31
|
+
agentConfigs: [agent1, agent2, agent3],
|
|
32
|
+
outcome: qualifiedSalesInterest,
|
|
33
|
+
lead: leadData,
|
|
34
|
+
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
if (result.winnerId) {
|
|
38
|
+
console.log(`Winner: ${result.winnerId}`);
|
|
39
|
+
console.log(`Total cost: ${result.totalCost} tokens`);
|
|
40
|
+
}
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Kill Agent (`killAgent.ts`)
|
|
44
|
+
|
|
45
|
+
Agent termination logic based on limits.
|
|
46
|
+
|
|
47
|
+
**Key Functions:**
|
|
48
|
+
|
|
49
|
+
- `shouldKillAgent(agent, limits)` - Checks if agent should be terminated
|
|
50
|
+
- `killAgent(agentId, reason)` - Terminates an agent
|
|
51
|
+
- `checkAllAgents(agents, limits)` - Checks all agents for termination
|
|
52
|
+
|
|
53
|
+
**Kill Reasons:**
|
|
54
|
+
|
|
55
|
+
- `cost_exceeded` - Agent exceeded token ceiling
|
|
56
|
+
- `attempts_exceeded` - Agent exceeded max attempts
|
|
57
|
+
- `timeout` - Agent exceeded runtime limit
|
|
58
|
+
- `competitor_won` - Another agent achieved success first
|
|
59
|
+
|
|
60
|
+
### Score Agent (`scoreAgent.ts`)
|
|
61
|
+
|
|
62
|
+
Agent scoring and winner determination.
|
|
63
|
+
|
|
64
|
+
**Key Functions:**
|
|
65
|
+
|
|
66
|
+
- `scoreAgent(result, metrics)` - Creates a score for an agent
|
|
67
|
+
- `determineWinner(scores)` - Finds the winning agent
|
|
68
|
+
- `rankAgents(scores)` - Ranks all agents by performance
|
|
69
|
+
- `calculateLeagueStats(scores)` - Calculates aggregate statistics
|
|
70
|
+
|
|
71
|
+
## League Execution Flow
|
|
72
|
+
|
|
73
|
+
```text
|
|
74
|
+
1. Start N agents in parallel
|
|
75
|
+
2. Each agent attempts to achieve the outcome
|
|
76
|
+
3. First agent to get SUCCESS evaluation wins
|
|
77
|
+
4. All other agents are terminated with 'competitor_won'
|
|
78
|
+
5. If no agent succeeds, league ends with no winner
|
|
79
|
+
6. Global spend ceiling terminates all agents if exceeded
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Requirements Reference
|
|
83
|
+
|
|
84
|
+
- **4.1** - Spin up N agents in parallel
|
|
85
|
+
- **4.3** - Terminate agent on attempt limit exceeded
|
|
86
|
+
- **4.4** - Terminate agent on cost ceiling exceeded
|
|
87
|
+
- **4.5** - Promote winning agent and halt others
|
|
88
|
+
- **10.2** - Enforce max runtime per agent
|
|
89
|
+
- **10.3** - Enforce global spend ceiling
|
|
90
|
+
|
|
91
|
+
### Multi-Step Orchestrator (`multi-step-orchestrator.ts`)
|
|
92
|
+
|
|
93
|
+
Manages multi-step bounty execution with sequential dependent tasks.
|
|
94
|
+
|
|
95
|
+
**Key Interfaces:**
|
|
96
|
+
|
|
97
|
+
- `TaskNode` - A single task in a multi-step bounty (id, name, description, dependencies, validator)
|
|
98
|
+
- `MultiStepBounty` - Bounty definition with task graph (tasks, finalTaskId, payoutAmount)
|
|
99
|
+
- `TaskContext` - Context passed to agent for task execution (task, dependencyOutputs, bounty info)
|
|
100
|
+
- `TaskExecutionResult` - Result of a single task (success, score, output, tokensUsed)
|
|
101
|
+
- `MultiStepResult` - Overall bounty execution result (taskResults, skippedTaskIds, overallScore)
|
|
102
|
+
- `MultiStepAgent` - Agent interface for multi-step execution
|
|
103
|
+
|
|
104
|
+
**Error Classes:**
|
|
105
|
+
|
|
106
|
+
- `CyclicDependencyError` - Thrown when task graph contains cycles
|
|
107
|
+
- `InvalidDependencyError` - Thrown when task references non-existent dependency
|
|
108
|
+
- `InvalidFinalTaskError` - Thrown when final task ID doesn't exist
|
|
109
|
+
- `TaskExecutionError` - Thrown when a task fails during execution
|
|
110
|
+
|
|
111
|
+
**Usage:**
|
|
112
|
+
|
|
113
|
+
```typescript
|
|
114
|
+
import {
|
|
115
|
+
MultiStepBounty,
|
|
116
|
+
TaskNode,
|
|
117
|
+
CyclicDependencyError
|
|
118
|
+
} from './multi-step-orchestrator.js';
|
|
119
|
+
|
|
120
|
+
const bounty: MultiStepBounty = {
|
|
121
|
+
id: 'research-bounty',
|
|
122
|
+
name: 'Market Research',
|
|
123
|
+
description: 'Complete market research workflow',
|
|
124
|
+
tasks: [
|
|
125
|
+
{ id: 'gather', name: 'Gather Data', dependencies: [], ... },
|
|
126
|
+
{ id: 'analyze', name: 'Analyze Data', dependencies: ['gather'], ... },
|
|
127
|
+
{ id: 'report', name: 'Generate Report', dependencies: ['analyze'], ... },
|
|
128
|
+
],
|
|
129
|
+
finalTaskId: 'report',
|
|
130
|
+
payoutAmount: 500,
|
|
131
|
+
};
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Team Coordinator (`team-coordinator.ts`)
|
|
135
|
+
|
|
136
|
+
Manages team battles where multiple agents collaborate on a bounty.
|
|
137
|
+
|
|
138
|
+
**Key Interfaces:**
|
|
139
|
+
|
|
140
|
+
- `TeamConfig` - Team configuration (teamId, memberIds, sharedStateEnabled)
|
|
141
|
+
- `TeamState` - Shared state with optimistic locking (data, version, lastModifiedBy)
|
|
142
|
+
- `StateUpdateResult` - Result of state update operation
|
|
143
|
+
- `MemberContribution` - Contribution metrics per team member
|
|
144
|
+
- `TeamPayoutDistribution` - Payout distribution across team members
|
|
145
|
+
- `TeamStateChangeEvent` - Event emitted on state changes
|
|
146
|
+
|
|
147
|
+
**Error Classes:**
|
|
148
|
+
|
|
149
|
+
- `StateConflictError` - Thrown when optimistic locking detects a conflict
|
|
150
|
+
- `TeamNotFoundError` - Thrown when team doesn't exist
|
|
151
|
+
- `NotTeamMemberError` - Thrown when agent isn't a team member
|
|
152
|
+
|
|
153
|
+
**Usage:**
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
import {
|
|
157
|
+
TeamConfig,
|
|
158
|
+
TeamState,
|
|
159
|
+
StateConflictError
|
|
160
|
+
} from './team-coordinator.js';
|
|
161
|
+
|
|
162
|
+
const team: TeamConfig = {
|
|
163
|
+
teamId: 'team-alpha',
|
|
164
|
+
memberIds: ['agent-1', 'agent-2', 'agent-3'],
|
|
165
|
+
sharedStateEnabled: true,
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
// Shared state uses optimistic locking
|
|
169
|
+
const state: TeamState = {
|
|
170
|
+
data: { researchNotes: [], completedTasks: [] },
|
|
171
|
+
version: 1,
|
|
172
|
+
lastModifiedBy: 'agent-1',
|
|
173
|
+
lastModifiedAt: new Date(),
|
|
174
|
+
};
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Design Principles
|
|
178
|
+
|
|
179
|
+
1. **Parallel Execution** - All agents run simultaneously
|
|
180
|
+
2. **First Win** - First successful agent wins, no ties
|
|
181
|
+
3. **Fail Closed** - An agent that violates a limit is terminated without payout
|
|
182
|
+
4. **Isolation** - Each agent tracks costs independently (solo battles)
|
|
183
|
+
5. **Observable** - All terminations and results are logged
|
|
184
|
+
6. **Team Coordination** - Shared state with optimistic locking for team battles
|
|
185
|
+
7. **Fair Distribution** - Contribution-based payout distribution for teams
|
|
186
|
+
8. **DAG Validation** - Multi-step bounties validate task graphs for cycles
|
|
187
|
+
9. **Output Propagation** - Task outputs flow to dependent tasks automatically
|
|
188
|
+
10. **Partial Completion** - When a multi-step bounty fails, it records which tasks had already completed
|