outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113):
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,843 @@
1
+ /**
2
+ * League Runner - Parallel agent competition orchestration
3
+ *
4
+ * Implements the league execution model where N agents compete in parallel
5
+ * to achieve an outcome. First agent to succeed wins the bounty.
6
+ *
7
+ * @module league/runLeague
8
+ * @see Requirements 4.1, 4.5, 10.3
9
+ */
10
+
11
+ import { runCommand, CommandResult } from '../utils/command-runner';
12
+ import {
13
+ parseTestOutput,
14
+ parseLintOutput,
15
+ parseBenchmarkOutput,
16
+ parseSecurityScanOutput,
17
+ TestResult,
18
+ LintResult,
19
+ BenchmarkResult,
20
+ SecurityScanResult,
21
+ } from '../utils/output-parsers';
22
+ import type { AgentConfig } from '../agents/agent.schema.js';
23
+ import type { Outcome } from '../outcomes/outcome.schema.js';
24
+ import type { Lead } from '../jobs/job.interface.js';
25
+ import {
26
+ type AgentRun,
27
+ type AgentRunConfig,
28
+ runAgent,
29
+ runAgentMock,
30
+ } from '../runtime/agentRunner.js';
31
+ import { evaluateOutcome, type EvaluationResult } from '../eval/evaluateOutcome.js';
32
+ import { calculateDawsCodeScore } from '../eval/weighted-scorer.js';
33
+ import { CODE_DELIVERY_OUTCOMES } from '../outcomes/code-delivery-outcomes.js';
34
+ import { logSuccess, logFailure } from '../utils/logger.js';
35
+ import { analyticsCollector } from '../analytics/collector.js';
36
+ import { dataCollector } from '../data/collector.js';
37
+ import { launchMonitor } from '../monitoring/launch-monitor.js';
38
+
/**
 * Input accepted by `runLeague`.
 *
 * @see design.md - League System
 */
export interface LeagueConfig {
  /** Identifier of the outcome being competed for; also embedded in the telemetry battle IDs. */
  outcomeId: string;
  /** How many agents to launch. `runLeague` rejects values that are not positive. */
  agentCount: number;
  /** Token budget shared by all agents; compared against the running sum of tokens spent. */
  globalSpendCeiling: number;
  /** Candidate agent configurations. The first `agentCount` entries are used, one per agent. */
  agentConfigs: AgentConfig[];
  /** Definition of the outcome handed to every agent and to the evaluator. */
  outcome: Outcome;
  /** Lead data handed to every agent. */
  lead: Lead;
  /** When true, agents run through the mock runner instead of the real one. Defaults to false. */
  mockMode?: boolean;
  /** Optional API key forwarded to each agent run. */
  apiKey?: string;
}
62
+
/**
 * Per-agent summary produced once an agent's run has been processed by the league.
 */
export interface AgentResult {
  /** League-scoped agent identifier. */
  agentId: string;
  /** How the agent finished: it won, it was killed, or it failed. */
  status: 'winner' | 'killed' | 'failed';
  /** Why the agent was killed; only set for killed agents. */
  killReason?: string;
  /** Attempts the agent made. */
  attempts: number;
  /** Tokens the agent consumed. */
  tokensSpent: number;
  /** How long the agent ran, in milliseconds. */
  durationMs: number;
  /** Evaluator verdict; present only when the agent produced an artifact that was evaluated. */
  evaluationResult?: EvaluationResult;
  /** DAWS score; only computed for code outcomes. */
  dawsScore?: number;
}
84
+
/**
 * Profitability figures derived from a finished tournament.
 *
 * @see Requirements 5.1, 5.2, 5.3, 5.4
 */
export interface TournamentMetrics {
  /** (payoutAmount - totalCost) / totalCost. */
  netProfitability: number;
  /** totalSpend / successCount. */
  costPerSuccess: number;
  /** How many agents succeeded. */
  successCount: number;
}
98
+
/**
 * What `runLeague` resolves with.
 *
 * @see design.md - League System
 */
export interface LeagueResult {
  /** Winning agent's ID, or null when nobody won. */
  winnerId: string | null;
  /** One entry per agent, in the order their runs were processed. */
  agents: AgentResult[];
  /** Sum of tokens spent by all agents. */
  totalCost: number;
  /** Wall-clock length of the league run, in milliseconds. */
  duration: number;
  /** True once the accumulated token spend reached the global ceiling. */
  globalCeilingHit: boolean;
  /** Profitability metrics computed from the payout, total spend and winner count. */
  tournamentMetrics: TournamentMetrics;
}
118
+
/**
 * Bookkeeping record the league keeps for each agent it has launched.
 */
interface RunningAgent {
  /** League-scoped agent identifier. */
  agentId: string;
  /** The configuration the agent was launched from. */
  config: AgentConfig;
  /** Promise for the agent's in-flight run. */
  promise: Promise<AgentRun>;
  /** Timestamp (ms since epoch) taken when the agent was launched. */
  startTime: number;
  /** Kill flag; initialised to false at launch. */
  killed: boolean;
  /** Optional reason associated with the kill flag. */
  killReason?: string;
}
130
+
/**
 * Turns the raw figures of one agent run into the five DAWS component scores.
 *
 * - GCR: 1 when the evaluation status is 'SUCCESS', otherwise 0.
 * - QAS: share of criteria that passed (an empty criteria list counts as one criterion).
 * - CEF: 1 - tokensSpent / spend ceiling, clamped to [0, 1]; the ceiling falls back to 50000
 *   when it is missing or not positive.
 * - DTT: 1 - runDurationMs / time limit, clamped to [0, 1]; the limit falls back to 30 minutes
 *   when it is missing or not positive.
 * - RES: 1 - (attempts - 1) / max(1, attempts), clamped to [0, 1].
 *
 * @param params - Evaluation result plus the run's duration, token spend, attempt count and
 *   the optional time limit / spend ceiling used for normalization.
 * @returns The five component scores keyed by their abbreviations.
 */
function normalizeDawsScores(params: {
  evalResult: EvaluationResult;
  runDurationMs: number;
  tokensSpent: number;
  attempts: number;
  outcomeTimeLimitMs?: number;
  globalSpendCeiling?: number;
}): { GCR: number; QAS: number; CEF: number; DTT: number; RES: number } {
  const clampUnit = (value: number): number => Math.max(0, Math.min(1, value));
  const positiveOr = (value: number | undefined, fallback: number): number =>
    value !== undefined && value > 0 ? value : fallback;

  const criteria = params.evalResult.criteriaResults;
  const totalCriteria = criteria.length || 1;
  let passedCriteria = 0;
  for (const criterion of criteria) {
    if (criterion.passed) {
      passedCriteria += 1;
    }
  }

  const tokenBudget = positiveOr(params.globalSpendCeiling, 50000);
  const timeBudgetMs = positiveOr(params.outcomeTimeLimitMs, 30 * 60 * 1000);

  return {
    GCR: params.evalResult.status === 'SUCCESS' ? 1 : 0,
    QAS: Math.min(1, passedCriteria / totalCriteria),
    CEF: clampUnit(1 - params.tokensSpent / tokenBudget),
    DTT: clampUnit(1 - params.runDurationMs / timeBudgetMs),
    RES: clampUnit(1 - (params.attempts - 1) / Math.max(1, params.attempts)),
  };
}
156
+
/**
 * Runs `command` in `cwd` and never rejects.
 *
 * @param command - Command to run; when missing or empty nothing is executed.
 * @param cwd - Working directory; when missing or empty nothing is executed.
 * @returns `undefined` when either argument is missing, otherwise the command result. If
 *   `runCommand` throws, a synthetic result with exit code 1 and the error text in `stderr`
 *   is returned instead.
 */
async function runCommandAsync(command?: string, cwd?: string): Promise<CommandResult | undefined> {
  if (!command || !cwd) return undefined;

  // Limits applied to every enrichment command.
  const timeoutMs = 30000;
  const maxOutputSize = 100000;

  try {
    // NOTE(review): the whole command string is passed as the executable with an empty
    // argument list; confirm that runCommand handles strings such as "npm test".
    return await runCommand(command, [], { cwd, timeoutMs, maxOutputSize });
  } catch (err: unknown) {
    // A thrown value is not guaranteed to be an Error; narrow it so stderr is always a string.
    const message = err instanceof Error ? err.message : String(err);
    return {
      exitCode: 1,
      stdout: '',
      stderr: message,
    };
  }
}
169
+
/**
 * Returns a copy of `artifact` whose content carries the results of the test, build, lint,
 * benchmark and security-scan commands named in that content.
 *
 * Commands run one after another, in that order, inside `content.worktreePath` (or
 * `content.repoPath` when the former is falsy). A command that is missing, or a missing
 * working directory, leaves the corresponding result field untouched.
 *
 * NOTE(review): the command strings are read from the artifact content itself; confirm that
 * artifacts reaching this function are trusted, because they decide what gets executed.
 *
 * @param artifact - Artifact produced by an agent run.
 * @returns A new artifact object with a shallow-copied, enriched `content`.
 */
async function enrichCodeExecution(artifact: AgentRun['artifacts'][number]): Promise<AgentRun['artifacts'][number]> {
  const enriched: any = { ...(artifact.content as any) };
  const workDir = (enriched.worktreePath || enriched.repoPath) as string | undefined;

  // Tests
  const testRun = await runCommandAsync(enriched.testCommand, workDir);
  if (testRun) {
    enriched.testResult = parseTestOutput(testRun);
  }

  // Build: only the exit status is kept
  const buildRun = await runCommandAsync(enriched.buildCommand, workDir);
  if (buildRun) {
    const { exitCode } = buildRun;
    enriched.buildResult = { success: exitCode === 0, exitCode };
  }

  // Lint
  const lintRun = await runCommandAsync(enriched.lintCommand, workDir);
  if (lintRun) {
    enriched.lintResult = parseLintOutput(lintRun);
  }

  // Benchmark: parsed metrics are stored for later validation
  const benchmarkRun = await runCommandAsync(enriched.benchmarkCommand, workDir);
  if (benchmarkRun) {
    enriched.benchmarkResult = parseBenchmarkOutput(benchmarkRun);
  }

  // Security scan
  const securityRun = await runCommandAsync(enriched.securityScanCommand, workDir);
  if (securityRun) {
    enriched.securityScanResult = parseSecurityScanOutput(securityRun);
  }

  return { ...artifact, content: enriched };
}
217
+
/**
 * Derives the profitability metrics of a tournament.
 *
 * - Net profitability is (payoutAmount - totalCost) / totalCost (Requirement 5.1).
 * - Cost per success is totalCost / successCount (Requirement 5.2).
 * - Edge cases (Requirement 5.4): a total cost of exactly 0 yields a profitability of 0, and
 *   a success count of exactly 0 yields a cost per success of Infinity.
 *
 * @param payoutAmount - Payout offered for the outcome
 * @param totalCost - Tokens spent by all agents combined
 * @param successCount - How many agents succeeded
 * @returns The computed metrics, with `successCount` echoed back
 *
 * @example
 * const metrics = calculateTournamentMetrics(250, 100, 1);
 * // { netProfitability: 1.5, costPerSuccess: 100, successCount: 1 }
 *
 * @see Requirements 5.1, 5.2, 5.4
 */
export function calculateTournamentMetrics(
  payoutAmount: number,
  totalCost: number,
  successCount: number
): TournamentMetrics {
  let netProfitability = 0;
  if (totalCost !== 0) {
    netProfitability = (payoutAmount - totalCost) / totalCost;
  }

  let costPerSuccess = Infinity;
  if (successCount !== 0) {
    costPerSuccess = totalCost / successCount;
  }

  return { netProfitability, costPerSuccess, successCount };
}
254
+
/**
 * Runs a league in which `agentCount` agents compete for a single outcome.
 *
 * Implements:
 * - Parallel agent execution (Requirement 4.1)
 * - Winner promotion and competitor termination (Requirement 4.5)
 * - Global spend ceiling enforcement (Requirement 10.3)
 * - Analytics and monitoring integration (Requirements 12.1)
 *
 * Flow:
 * 1. The configuration is validated before any telemetry is emitted, so an invalid
 *    configuration does not leave a "started" battle that is never completed.
 * 2. Every agent is launched (mock or real runner) with a kill-signal callback that returns
 *    true once another agent has won or the accumulated token spend has reached the ceiling.
 *    The accumulated spend is only updated when a run finishes, so the ceiling check reflects
 *    completed runs only.
 * 3. Runs are processed in completion order. The first run whose artifact evaluates to
 *    'SUCCESS' becomes the winner; later successes are reported as killed with reason
 *    'competitor_won'.
 * 4. Tournament metrics and completion telemetry are recorded and the result is returned.
 *
 * @param config - League configuration
 * @returns LeagueResult with winner and all agent results
 * @throws Error when `agentCount` is not positive or fewer than `agentCount` agent
 *   configurations were supplied. Rejections from an agent run or from the telemetry
 *   collectors are not caught here and propagate to the caller.
 *
 * @example
 * const result = await runLeague({
 *   outcomeId: 'qualified_sales_interest',
 *   agentCount: 3,
 *   globalSpendCeiling: 50000,
 *   agentConfigs: [agent1, agent2, agent3],
 *   outcome: qualifiedSalesInterest,
 *   lead: leadData,
 * });
 *
 * @see Requirements 4.1, 4.5, 10.3, 12.1
 */
export async function runLeague(config: LeagueConfig): Promise<LeagueResult> {
  const startTime = Date.now();
  const {
    outcomeId,
    agentCount,
    globalSpendCeiling,
    agentConfigs,
    outcome,
    lead,
    mockMode = false,
    apiKey,
  } = config;

  // Validate first: nothing below should be recorded for a league that cannot start.
  if (agentCount <= 0) {
    throw new Error('Agent count must be positive');
  }
  if (agentConfigs.length < agentCount) {
    throw new Error(`Not enough agent configs: need ${agentCount}, got ${agentConfigs.length}`);
  }

  // Single source of truth for the league-level battle ID used by every collector call.
  const leagueBattleId = `league_${outcomeId}_${startTime}`;

  // Track league start in analytics and monitoring systems
  await dataCollector.trackBattleExecution({
    battleId: leagueBattleId,
    outcomeId,
    agents: agentConfigs.slice(0, agentCount).map((a) => ({
      agentId: a.id,
      developerId: 'system',
      modelType: a.modelProvider,
      version: '1.0.0',
    })),
    config: {
      maxAttempts: 3,
      timeLimit: 300000,
      payoutAmount: 0,
    },
    results: {
      attempts: 0,
      duration: 0,
      totalCost: 0,
      success: false,
    },
    metrics: {
      tokensUsed: 0,
      averageResponseTime: 0,
      successRate: 0,
      costPerAttempt: 0,
    },
    metadata: {
      agentCount,
      globalSpendCeiling,
      mockMode,
    },
  });

  // Start battle tracking in analytics collector (extends MetricsCollector)
  analyticsCollector.startBattle(leagueBattleId, agentCount);

  launchMonitor.recordEvent({
    type: 'business',
    metric: 'league_started',
    value: 1,
    timestamp: new Date(startTime),
    metadata: { outcomeId, agentCount },
  });

  // Shared league state; the kill-signal closures below read winnerId and globalTokensSpent.
  const runningAgents: RunningAgent[] = [];
  let winnerId: string | null = null;
  let globalTokensSpent = 0;
  let globalCeilingHit = false;

  // Start N agents in parallel (Requirement 4.1)
  for (let i = 0; i < agentCount; i++) {
    const agentConfig = agentConfigs[i];
    // The index suffix keeps IDs unique even if the same config is supplied twice.
    const agentId = `${agentConfig.id}-${i}`;

    // Kill signal checker for this agent
    const shouldKill = (): boolean => {
      // Kill if another agent already won
      if (winnerId !== null && winnerId !== agentId) {
        return true;
      }
      // Kill if global ceiling exceeded
      if (globalTokensSpent >= globalSpendCeiling) {
        return true;
      }
      return false;
    };

    const runConfig: AgentRunConfig = {
      agent: { ...agentConfig, id: agentId },
      outcome,
      lead,
      apiKey,
      onKillSignal: shouldKill,
    };

    // Start agent (mock or real); the promise is awaited later, in the completion loop.
    const runFn = mockMode ? runAgentMock : runAgent;
    const promise = runFn(runConfig);

    runningAgents.push({
      agentId,
      config: agentConfig,
      promise,
      startTime: Date.now(),
      killed: false,
    });

    // Track agent start
    await dataCollector.trackUserInteraction({
      userId: agentId,
      sessionId: leagueBattleId,
      type: 'form_submit', // Agent submission is a form submit action
      action: 'agent_started',
      page: 'league',
      success: true,
      metadata: {
        outcomeId,
        agentConfig: agentConfig.id,
        leagueStartTime: startTime,
      },
    });

    // Track agent submission in analytics
    analyticsCollector.recordUserActivity({
      userId: agentId,
      userRole: 'developer',
      action: 'submit_agent',
      timestamp: new Date(),
      metadata: {
        outcomeId,
        agentConfig: agentConfig.id,
      },
    });
  }

  const agentResults: AgentResult[] = [];

  // Tag each run promise with its agent ID once, so the race below can tell which agent
  // finished without re-wrapping every pending promise on each iteration.
  const pendingPromises = new Map<string, Promise<{ agentId: string; run: AgentRun }>>();
  for (const agent of runningAgents) {
    pendingPromises.set(
      agent.agentId,
      agent.promise.then((run) => ({ agentId: agent.agentId, run }))
    );
  }

  // Process agents as they complete
  while (pendingPromises.size > 0) {
    const { agentId, run } = await Promise.race(Array.from(pendingPromises.values()));
    pendingPromises.delete(agentId);

    const agentBattleId = `agent_${agentId}_${startTime}`;
    const estimatedCost = run.tokensSpent * 0.001; // Convert tokens to cost estimate
    const costPerAttempt = run.tokensSpent / Math.max(run.attempts, 1);

    // Update global token count
    globalTokensSpent += run.tokensSpent;

    // Track token usage in analytics
    analyticsCollector.recordFinancialEvent({
      type: 'cost',
      category: 'ai_model',
      amount: estimatedCost,
      currency: 'USD',
      timestamp: new Date(),
      metadata: {
        agentId,
        outcomeId,
        tokensSpent: run.tokensSpent,
      },
    });

    // Record tokens in analytics collector (extends MetricsCollector)
    analyticsCollector.recordTokens(leagueBattleId, run.tokensSpent, estimatedCost);

    // Check global ceiling (the event is recorded only the first time it is reached)
    if (globalTokensSpent >= globalSpendCeiling && !globalCeilingHit) {
      globalCeilingHit = true;

      launchMonitor.recordEvent({
        type: 'system',
        metric: 'global_ceiling_hit',
        value: globalTokensSpent,
        timestamp: new Date(),
        metadata: { outcomeId, ceiling: globalSpendCeiling },
      });
    }

    // Determine agent result
    let agentResult: AgentResult;

    if (run.status === 'killed') {
      // Agent was killed (cost exceeded, timeout, or competitor won).
      // Normalize the reason once so the result, the log line and the telemetry agree.
      const killReason = run.killReason ?? 'unknown';

      agentResult = {
        agentId,
        status: 'killed',
        killReason,
        attempts: run.attempts,
        tokensSpent: run.tokensSpent,
        durationMs: run.durationMs,
      };

      logFailure(agentId, outcomeId, 'v1.0.0', run.tokensSpent, `Killed: ${killReason}`);

      // Track agent failure
      await dataCollector.trackBattleExecution({
        battleId: agentBattleId,
        outcomeId,
        agents: [{
          agentId: agentId,
          developerId: 'system',
          modelType: 'unknown',
          version: '1.0.0',
        }],
        config: {
          maxAttempts: 3,
          timeLimit: 300000,
          payoutAmount: 0,
        },
        results: {
          attempts: run.attempts,
          duration: Date.now() - startTime,
          totalCost: run.tokensSpent,
          success: false,
          errorMessage: killReason,
        },
        metrics: {
          tokensUsed: run.tokensSpent,
          averageResponseTime: 0,
          successRate: 0,
          costPerAttempt,
        },
        metadata: {
          killReason,
          attempts: run.attempts,
        },
      });
    } else if (run.status === 'completed' && run.artifacts.length > 0) {
      // Agent completed - evaluate the most recent artifact
      const artifact = run.artifacts[run.artifacts.length - 1];
      const isCodeOutcome = Boolean(CODE_DELIVERY_OUTCOMES[outcome.name]);
      // Code outcomes get their test/build/lint/benchmark/security results attached first.
      const enrichedArtifact = isCodeOutcome ? await enrichCodeExecution(artifact) : artifact;
      const evalResult = await evaluateOutcome(outcome, enrichedArtifact);
      const dawsScore = isCodeOutcome
        ? calculateDawsCodeScore(
            normalizeDawsScores({
              evalResult,
              runDurationMs: run.durationMs,
              tokensSpent: run.tokensSpent,
              attempts: run.attempts,
              outcomeTimeLimitMs: outcome.timeLimitMs,
              globalSpendCeiling,
            })
          ).finalScore
        : undefined;

      if (evalResult.status === 'SUCCESS' && winnerId === null) {
        // First successful agent wins! (Requirement 4.5)
        winnerId = agentId;
        agentResult = {
          agentId,
          status: 'winner',
          attempts: run.attempts,
          tokensSpent: run.tokensSpent,
          durationMs: run.durationMs,
          evaluationResult: evalResult,
          dawsScore,
        };

        logSuccess(agentId, outcomeId, 'v1.0.0', run.tokensSpent, {
          winner: true,
          payoutAmount: outcome.payoutAmount,
        });

        // Track winner in analytics
        analyticsCollector.recordDeveloperEarnings({
          developerId: agentId,
          agentId,
          battleId: leagueBattleId,
          earnings: outcome.payoutAmount,
          timestamp: new Date(),
          winRate: 1.0,
        });

        analyticsCollector.recordFinancialEvent({
          type: 'cost',
          category: 'payout',
          amount: outcome.payoutAmount,
          currency: 'USD',
          timestamp: new Date(),
          metadata: {
            agentId,
            outcomeId,
            winnerId: agentId,
          },
        });

        // Track successful battle
        await dataCollector.trackBattleExecution({
          battleId: agentBattleId,
          outcomeId,
          agents: [{
            agentId: agentId,
            developerId: 'system',
            modelType: 'unknown',
            version: '1.0.0',
          }],
          config: {
            maxAttempts: 3,
            timeLimit: 300000,
            payoutAmount: outcome.payoutAmount,
          },
          results: {
            winner: agentId,
            attempts: run.attempts,
            duration: Date.now() - startTime,
            totalCost: run.tokensSpent,
            success: true,
          },
          metrics: {
            tokensUsed: run.tokensSpent,
            averageResponseTime: 0,
            successRate: 1.0,
            costPerAttempt,
          },
          metadata: {
            payoutAmount: outcome.payoutAmount,
            attempts: run.attempts,
            evaluationResult: evalResult.status,
          },
        });

        // Complete battle in analytics collector
        analyticsCollector.completeBattle(leagueBattleId, agentId);
      } else {
        // Evaluation failed or another agent already won
        const isCompetitorWon = winnerId !== null;
        const failureMessage = isCompetitorWon ? 'Competitor won first' : evalResult.reason;

        agentResult = {
          agentId,
          status: isCompetitorWon ? 'killed' : 'failed',
          killReason: isCompetitorWon ? 'competitor_won' : undefined,
          attempts: run.attempts,
          tokensSpent: run.tokensSpent,
          durationMs: run.durationMs,
          evaluationResult: evalResult,
          dawsScore,
        };

        logFailure(agentId, outcomeId, 'v1.0.0', run.tokensSpent, failureMessage);

        // Track agent failure
        await dataCollector.trackBattleExecution({
          battleId: agentBattleId,
          outcomeId,
          agents: [{
            agentId: agentId,
            developerId: 'system',
            modelType: 'unknown',
            version: '1.0.0',
          }],
          config: {
            maxAttempts: 3,
            timeLimit: 300000,
            payoutAmount: 0,
          },
          results: {
            attempts: run.attempts,
            duration: run.durationMs,
            totalCost: run.tokensSpent,
            success: false,
            errorMessage: failureMessage,
          },
          metrics: {
            tokensUsed: run.tokensSpent,
            averageResponseTime: 0,
            successRate: 0,
            costPerAttempt,
          },
          metadata: {
            evaluationStatus: evalResult.status,
            evaluationReason: evalResult.reason,
            competitorWon: isCompetitorWon,
          },
        });

        // Mark battle as failed in analytics collector if no winner yet.
        // NOTE(review): other agents may still be pending and win later; confirm that
        // failBattle followed by completeBattle on the same battle ID is acceptable.
        if (!winnerId) {
          analyticsCollector.failBattle(leagueBattleId, evalResult.reason);
        }
      }
    } else {
      // Agent failed without producing artifacts
      const failureMessage = run.error ?? 'No artifacts produced';

      agentResult = {
        agentId,
        status: 'failed',
        attempts: run.attempts,
        tokensSpent: run.tokensSpent,
        durationMs: run.durationMs,
      };

      logFailure(agentId, outcomeId, 'v1.0.0', run.tokensSpent, failureMessage);

      // Track agent failure
      await dataCollector.trackBattleExecution({
        battleId: agentBattleId,
        outcomeId,
        agents: [{
          agentId: agentId,
          developerId: 'system',
          modelType: 'unknown',
          version: '1.0.0',
        }],
        config: {
          maxAttempts: 3,
          timeLimit: 300000,
          payoutAmount: 0,
        },
        results: {
          attempts: run.attempts,
          duration: run.durationMs,
          totalCost: run.tokensSpent,
          success: false,
          errorMessage: failureMessage,
        },
        metrics: {
          tokensUsed: run.tokensSpent,
          averageResponseTime: 0,
          successRate: 0,
          costPerAttempt,
        },
        metadata: {
          error: run.error,
          noArtifacts: true,
        },
      });

      // Mark battle as failed in analytics collector
      analyticsCollector.failBattle(leagueBattleId, failureMessage);
    }

    agentResults.push(agentResult);
  }

  // Calculate tournament metrics (Requirements 5.1, 5.2, 5.3)
  const successCount = agentResults.filter((agent) => agent.status === 'winner').length;
  const tournamentMetrics = calculateTournamentMetrics(
    outcome.payoutAmount,
    globalTokensSpent,
    successCount
  );

  const endTime = Date.now();
  const duration = endTime - startTime;

  // Track league completion
  await dataCollector.trackBattleExecution({
    battleId: leagueBattleId,
    outcomeId,
    agents: agentConfigs.slice(0, agentCount).map((a) => ({
      agentId: a.id,
      developerId: 'system',
      modelType: a.modelProvider,
      version: '1.0.0',
    })),
    config: {
      maxAttempts: 3,
      timeLimit: 300000,
      payoutAmount: winnerId ? outcome.payoutAmount || 0 : 0,
    },
    results: {
      winner: winnerId || undefined,
      attempts: agentCount,
      duration: duration,
      totalCost: globalTokensSpent,
      success: winnerId !== null,
    },
    metrics: {
      tokensUsed: globalTokensSpent,
      averageResponseTime: duration / agentCount,
      successRate: winnerId ? 1.0 : 0.0,
      costPerAttempt: globalTokensSpent / agentCount,
    },
    metadata: {
      agentCount,
      globalSpendCeiling,
      globalCeilingHit,
      tournamentMetrics,
      duration,
    },
  });

  // Track business metrics
  launchMonitor.recordEvent({
    type: 'business',
    metric: 'league_completed',
    value: 1,
    timestamp: new Date(endTime),
    metadata: {
      outcomeId,
      winnerId,
      totalCost: globalTokensSpent,
      duration,
      success: winnerId !== null,
    },
  });

  // Track system performance
  launchMonitor.recordEvent({
    type: 'system',
    metric: 'league_duration',
    value: duration,
    timestamp: new Date(endTime),
    metadata: { outcomeId, agentCount },
  });

  return {
    winnerId,
    agents: agentResults,
    totalCost: globalTokensSpent,
    duration,
    globalCeilingHit,
    tournamentMetrics,
  };
}
832
+
/**
 * Convenience wrapper that runs a league with mock agents, for use in tests.
 *
 * @param config - League configuration without `mockMode`; it is always set to true here
 * @returns The same LeagueResult that `runLeague` produces
 */
export async function runLeagueMock(
  config: Omit<LeagueConfig, 'mockMode'>
): Promise<LeagueResult> {
  const mockConfig: LeagueConfig = { ...config, mockMode: true };
  return runLeague(mockConfig);
}